Created
January 23, 2025 16:56
-
-
Save AmosLewis/3da26fd016b968cfebc98b309f5d5845 to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module @module { | |
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16> | |
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.0.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.0.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.0.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.0.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.0.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.0.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.1.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.1.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.1.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.1.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.1.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.1.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.1.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.1.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.1.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.2.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.2.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.2.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.2.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.2.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.2.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.2.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.2.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.2.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.3.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.3.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.3.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.3.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.3.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.3.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.3.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.3.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.3.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.4.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.4.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.4.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.4.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.4.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.4.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.4.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.4.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.4.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.5.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.5.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.5.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.5.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.5.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.5.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.5.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.5.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.5.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.6.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.6.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.6.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.6.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.6.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.6.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.6.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.6.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.6.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.7.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.7.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.7.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.7.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.7.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.7.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.7.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.7.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.7.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.8.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.8.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.8.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.8.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.8.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.8.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.8.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.8.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.8.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.9.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.9.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.9.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.9.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.9.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.9.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.9.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.9.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.9.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.10.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.10.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.10.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.10.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.10.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.10.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.10.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.10.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.10.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.11.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.11.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.11.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.11.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.11.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.11.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.11.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.11.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.11.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.12.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.12.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.12.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.12.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.12.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.12.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.12.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.12.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.12.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.13.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.13.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.13.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.13.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.13.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.13.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.13.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.13.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.13.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.14.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.14.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.14.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.14.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.14.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.14.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.14.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.14.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.14.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.15.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.15.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.15.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.15.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.15.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.15.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.15.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.15.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.15.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.16.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.16.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.16.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.16.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.16.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.16.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.16.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.16.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.16.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.17.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.17.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.17.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.18.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.18.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.18.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.19.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.19.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.19.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.20.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.20.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.20.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.21.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.21.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.21.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.22.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.22.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.22.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.23.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.23.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.23.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.24.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.24.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.24.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.25.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.25.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.25.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.26.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.26.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.26.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.27.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.27.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.27.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.28.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.28.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.28.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.29.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.29.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.29.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.30.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.30.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.30.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.31.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.31.kv_cache.quantizer:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.31.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16> | |
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16> | |
func.func @prefill_bs1(%arg0: !torch.vtensor<[1,?],si64>, %arg1: !torch.vtensor<[1],si64>, %arg2: !torch.vtensor<[1,?],si64>, %arg3: !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[1,?,128256],bf16> attributes {torch.assume_strict_symbolic_shapes} { | |
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xbf16> | |
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16> | |
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xbf16> | |
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.0.attn_q.q_input3Arscale = util.global.load @"__auto.blk.0.attn_q.q_input:rscale" : tensor<f32> | |
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_q.weight3Aqs = util.global.load @"__auto.blk.0.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_k.q_input3Arscale = util.global.load @"__auto.blk.0.attn_k.q_input:rscale" : tensor<f32> | |
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_k.weight3Aqs = util.global.load @"__auto.blk.0.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%5 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_v.q_input3Arscale = util.global.load @"__auto.blk.0.attn_v.q_input:rscale" : tensor<f32> | |
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_v.weight3Aqs = util.global.load @"__auto.blk.0.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%7 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.0.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.0.kv_cache.quantizer:rscale" : tensor<f32> | |
%8 = torch_c.from_builtin_tensor %__auto.blk.0.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_output.q_input3Arscale = util.global.load @"__auto.blk.0.attn_output.q_input:rscale" : tensor<f32> | |
%9 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_output.weight3Aqs = util.global.load @"__auto.blk.0.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%10 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> | |
%11 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.0.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_gate.q_input:rscale" : tensor<f32> | |
%12 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.0.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%13 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_up.q_input:rscale" : tensor<f32> | |
%14 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_up.weight3Aqs = util.global.load @"__auto.blk.0.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%15 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_down.q_input:rscale" : tensor<f32> | |
%16 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_down.weight3Aqs = util.global.load @"__auto.blk.0.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%17 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xbf16> | |
%18 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.1.attn_q.q_input3Arscale = util.global.load @"__auto.blk.1.attn_q.q_input:rscale" : tensor<f32> | |
%19 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_q.weight3Aqs = util.global.load @"__auto.blk.1.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%20 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_k.q_input3Arscale = util.global.load @"__auto.blk.1.attn_k.q_input:rscale" : tensor<f32> | |
%21 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_k.weight3Aqs = util.global.load @"__auto.blk.1.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%22 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_v.q_input3Arscale = util.global.load @"__auto.blk.1.attn_v.q_input:rscale" : tensor<f32> | |
%23 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_v.weight3Aqs = util.global.load @"__auto.blk.1.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%24 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.1.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.1.kv_cache.quantizer:rscale" : tensor<f32> | |
%25 = torch_c.from_builtin_tensor %__auto.blk.1.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_output.q_input3Arscale = util.global.load @"__auto.blk.1.attn_output.q_input:rscale" : tensor<f32> | |
%26 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_output.weight3Aqs = util.global.load @"__auto.blk.1.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%27 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> | |
%28 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.1.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_gate.q_input:rscale" : tensor<f32> | |
%29 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.1.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%30 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_up.q_input:rscale" : tensor<f32> | |
%31 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_up.weight3Aqs = util.global.load @"__auto.blk.1.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%32 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_down.q_input:rscale" : tensor<f32> | |
%33 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_down.weight3Aqs = util.global.load @"__auto.blk.1.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%34 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xbf16> | |
%35 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.2.attn_q.q_input3Arscale = util.global.load @"__auto.blk.2.attn_q.q_input:rscale" : tensor<f32> | |
%36 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_q.weight3Aqs = util.global.load @"__auto.blk.2.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%37 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_k.q_input3Arscale = util.global.load @"__auto.blk.2.attn_k.q_input:rscale" : tensor<f32> | |
%38 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_k.weight3Aqs = util.global.load @"__auto.blk.2.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%39 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_v.q_input3Arscale = util.global.load @"__auto.blk.2.attn_v.q_input:rscale" : tensor<f32> | |
%40 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_v.weight3Aqs = util.global.load @"__auto.blk.2.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%41 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.2.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.2.kv_cache.quantizer:rscale" : tensor<f32> | |
%42 = torch_c.from_builtin_tensor %__auto.blk.2.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_output.q_input3Arscale = util.global.load @"__auto.blk.2.attn_output.q_input:rscale" : tensor<f32> | |
%43 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_output.weight3Aqs = util.global.load @"__auto.blk.2.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%44 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> | |
%45 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.2.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_gate.q_input:rscale" : tensor<f32> | |
%46 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.2.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%47 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_up.q_input:rscale" : tensor<f32> | |
%48 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_up.weight3Aqs = util.global.load @"__auto.blk.2.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%49 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_down.q_input:rscale" : tensor<f32> | |
%50 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_down.weight3Aqs = util.global.load @"__auto.blk.2.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%51 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xbf16> | |
%52 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.3.attn_q.q_input3Arscale = util.global.load @"__auto.blk.3.attn_q.q_input:rscale" : tensor<f32> | |
%53 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_q.weight3Aqs = util.global.load @"__auto.blk.3.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%54 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_k.q_input3Arscale = util.global.load @"__auto.blk.3.attn_k.q_input:rscale" : tensor<f32> | |
%55 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_k.weight3Aqs = util.global.load @"__auto.blk.3.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%56 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_v.q_input3Arscale = util.global.load @"__auto.blk.3.attn_v.q_input:rscale" : tensor<f32> | |
%57 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_v.weight3Aqs = util.global.load @"__auto.blk.3.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%58 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.3.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.3.kv_cache.quantizer:rscale" : tensor<f32> | |
%59 = torch_c.from_builtin_tensor %__auto.blk.3.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_output.q_input3Arscale = util.global.load @"__auto.blk.3.attn_output.q_input:rscale" : tensor<f32> | |
%60 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_output.weight3Aqs = util.global.load @"__auto.blk.3.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%61 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> | |
%62 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.3.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_gate.q_input:rscale" : tensor<f32> | |
%63 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.3.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%64 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_up.q_input:rscale" : tensor<f32> | |
%65 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_up.weight3Aqs = util.global.load @"__auto.blk.3.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%66 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_down.q_input:rscale" : tensor<f32> | |
%67 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_down.weight3Aqs = util.global.load @"__auto.blk.3.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%68 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xbf16> | |
%69 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.4.attn_q.q_input3Arscale = util.global.load @"__auto.blk.4.attn_q.q_input:rscale" : tensor<f32> | |
%70 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_q.weight3Aqs = util.global.load @"__auto.blk.4.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%71 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_k.q_input3Arscale = util.global.load @"__auto.blk.4.attn_k.q_input:rscale" : tensor<f32> | |
%72 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_k.weight3Aqs = util.global.load @"__auto.blk.4.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%73 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_v.q_input3Arscale = util.global.load @"__auto.blk.4.attn_v.q_input:rscale" : tensor<f32> | |
%74 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_v.weight3Aqs = util.global.load @"__auto.blk.4.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%75 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.4.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.4.kv_cache.quantizer:rscale" : tensor<f32> | |
%76 = torch_c.from_builtin_tensor %__auto.blk.4.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_output.q_input3Arscale = util.global.load @"__auto.blk.4.attn_output.q_input:rscale" : tensor<f32> | |
%77 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_output.weight3Aqs = util.global.load @"__auto.blk.4.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%78 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> | |
%79 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.4.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_gate.q_input:rscale" : tensor<f32> | |
%80 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.4.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%81 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_up.q_input:rscale" : tensor<f32> | |
%82 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_up.weight3Aqs = util.global.load @"__auto.blk.4.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%83 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_down.q_input:rscale" : tensor<f32> | |
%84 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_down.weight3Aqs = util.global.load @"__auto.blk.4.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%85 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xbf16> | |
%86 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.5.attn_q.q_input3Arscale = util.global.load @"__auto.blk.5.attn_q.q_input:rscale" : tensor<f32> | |
%87 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_q.weight3Aqs = util.global.load @"__auto.blk.5.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%88 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_k.q_input3Arscale = util.global.load @"__auto.blk.5.attn_k.q_input:rscale" : tensor<f32> | |
%89 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_k.weight3Aqs = util.global.load @"__auto.blk.5.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%90 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_v.q_input3Arscale = util.global.load @"__auto.blk.5.attn_v.q_input:rscale" : tensor<f32> | |
%91 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_v.weight3Aqs = util.global.load @"__auto.blk.5.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%92 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.5.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.5.kv_cache.quantizer:rscale" : tensor<f32> | |
%93 = torch_c.from_builtin_tensor %__auto.blk.5.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_output.q_input3Arscale = util.global.load @"__auto.blk.5.attn_output.q_input:rscale" : tensor<f32> | |
%94 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_output.weight3Aqs = util.global.load @"__auto.blk.5.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%95 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> | |
%96 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.5.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_gate.q_input:rscale" : tensor<f32> | |
%97 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.5.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%98 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_up.q_input:rscale" : tensor<f32> | |
%99 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_up.weight3Aqs = util.global.load @"__auto.blk.5.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%100 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_down.q_input:rscale" : tensor<f32> | |
%101 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_down.weight3Aqs = util.global.load @"__auto.blk.5.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%102 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xbf16> | |
%103 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.6.attn_q.q_input3Arscale = util.global.load @"__auto.blk.6.attn_q.q_input:rscale" : tensor<f32> | |
%104 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_q.weight3Aqs = util.global.load @"__auto.blk.6.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%105 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_k.q_input3Arscale = util.global.load @"__auto.blk.6.attn_k.q_input:rscale" : tensor<f32> | |
%106 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_k.weight3Aqs = util.global.load @"__auto.blk.6.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%107 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_v.q_input3Arscale = util.global.load @"__auto.blk.6.attn_v.q_input:rscale" : tensor<f32> | |
%108 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_v.weight3Aqs = util.global.load @"__auto.blk.6.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%109 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.6.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.6.kv_cache.quantizer:rscale" : tensor<f32> | |
%110 = torch_c.from_builtin_tensor %__auto.blk.6.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_output.q_input3Arscale = util.global.load @"__auto.blk.6.attn_output.q_input:rscale" : tensor<f32> | |
%111 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_output.weight3Aqs = util.global.load @"__auto.blk.6.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%112 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> | |
%113 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.6.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_gate.q_input:rscale" : tensor<f32> | |
%114 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.6.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%115 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_up.q_input:rscale" : tensor<f32> | |
%116 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_up.weight3Aqs = util.global.load @"__auto.blk.6.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%117 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_down.q_input:rscale" : tensor<f32> | |
%118 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_down.weight3Aqs = util.global.load @"__auto.blk.6.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%119 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xbf16> | |
%120 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.7.attn_q.q_input3Arscale = util.global.load @"__auto.blk.7.attn_q.q_input:rscale" : tensor<f32> | |
%121 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_q.weight3Aqs = util.global.load @"__auto.blk.7.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%122 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_k.q_input3Arscale = util.global.load @"__auto.blk.7.attn_k.q_input:rscale" : tensor<f32> | |
%123 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_k.weight3Aqs = util.global.load @"__auto.blk.7.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%124 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_v.q_input3Arscale = util.global.load @"__auto.blk.7.attn_v.q_input:rscale" : tensor<f32> | |
%125 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_v.weight3Aqs = util.global.load @"__auto.blk.7.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%126 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.7.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.7.kv_cache.quantizer:rscale" : tensor<f32> | |
%127 = torch_c.from_builtin_tensor %__auto.blk.7.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_output.q_input3Arscale = util.global.load @"__auto.blk.7.attn_output.q_input:rscale" : tensor<f32> | |
%128 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_output.weight3Aqs = util.global.load @"__auto.blk.7.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%129 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> | |
%130 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.7.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_gate.q_input:rscale" : tensor<f32> | |
%131 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.7.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%132 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_up.q_input:rscale" : tensor<f32> | |
%133 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_up.weight3Aqs = util.global.load @"__auto.blk.7.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%134 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_down.q_input:rscale" : tensor<f32> | |
%135 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_down.weight3Aqs = util.global.load @"__auto.blk.7.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%136 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xbf16> | |
%137 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.8.attn_q.q_input3Arscale = util.global.load @"__auto.blk.8.attn_q.q_input:rscale" : tensor<f32> | |
%138 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_q.weight3Aqs = util.global.load @"__auto.blk.8.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%139 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_k.q_input3Arscale = util.global.load @"__auto.blk.8.attn_k.q_input:rscale" : tensor<f32> | |
%140 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_k.weight3Aqs = util.global.load @"__auto.blk.8.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%141 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_v.q_input3Arscale = util.global.load @"__auto.blk.8.attn_v.q_input:rscale" : tensor<f32> | |
%142 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_v.weight3Aqs = util.global.load @"__auto.blk.8.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%143 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.8.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.8.kv_cache.quantizer:rscale" : tensor<f32> | |
%144 = torch_c.from_builtin_tensor %__auto.blk.8.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_output.q_input3Arscale = util.global.load @"__auto.blk.8.attn_output.q_input:rscale" : tensor<f32> | |
%145 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_output.weight3Aqs = util.global.load @"__auto.blk.8.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%146 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> | |
%147 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.8.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_gate.q_input:rscale" : tensor<f32> | |
%148 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.8.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%149 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_up.q_input:rscale" : tensor<f32> | |
%150 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_up.weight3Aqs = util.global.load @"__auto.blk.8.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%151 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_down.q_input:rscale" : tensor<f32> | |
%152 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_down.weight3Aqs = util.global.load @"__auto.blk.8.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%153 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xbf16> | |
%154 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.9.attn_q.q_input3Arscale = util.global.load @"__auto.blk.9.attn_q.q_input:rscale" : tensor<f32> | |
%155 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_q.weight3Aqs = util.global.load @"__auto.blk.9.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%156 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_k.q_input3Arscale = util.global.load @"__auto.blk.9.attn_k.q_input:rscale" : tensor<f32> | |
%157 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_k.weight3Aqs = util.global.load @"__auto.blk.9.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%158 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_v.q_input3Arscale = util.global.load @"__auto.blk.9.attn_v.q_input:rscale" : tensor<f32> | |
%159 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_v.weight3Aqs = util.global.load @"__auto.blk.9.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%160 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.9.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.9.kv_cache.quantizer:rscale" : tensor<f32> | |
%161 = torch_c.from_builtin_tensor %__auto.blk.9.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_output.q_input3Arscale = util.global.load @"__auto.blk.9.attn_output.q_input:rscale" : tensor<f32> | |
%162 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_output.weight3Aqs = util.global.load @"__auto.blk.9.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%163 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> | |
%164 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.9.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_gate.q_input:rscale" : tensor<f32> | |
%165 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.9.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%166 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_up.q_input:rscale" : tensor<f32> | |
%167 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_up.weight3Aqs = util.global.load @"__auto.blk.9.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%168 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_down.q_input:rscale" : tensor<f32> | |
%169 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_down.weight3Aqs = util.global.load @"__auto.blk.9.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%170 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xbf16> | |
%171 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.10.attn_q.q_input3Arscale = util.global.load @"__auto.blk.10.attn_q.q_input:rscale" : tensor<f32> | |
%172 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_q.weight3Aqs = util.global.load @"__auto.blk.10.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%173 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_k.q_input3Arscale = util.global.load @"__auto.blk.10.attn_k.q_input:rscale" : tensor<f32> | |
%174 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_k.weight3Aqs = util.global.load @"__auto.blk.10.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%175 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_v.q_input3Arscale = util.global.load @"__auto.blk.10.attn_v.q_input:rscale" : tensor<f32> | |
%176 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_v.weight3Aqs = util.global.load @"__auto.blk.10.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%177 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.10.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.10.kv_cache.quantizer:rscale" : tensor<f32> | |
%178 = torch_c.from_builtin_tensor %__auto.blk.10.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_output.q_input3Arscale = util.global.load @"__auto.blk.10.attn_output.q_input:rscale" : tensor<f32> | |
%179 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_output.weight3Aqs = util.global.load @"__auto.blk.10.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%180 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> | |
%181 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.10.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_gate.q_input:rscale" : tensor<f32> | |
%182 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.10.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%183 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_up.q_input:rscale" : tensor<f32> | |
%184 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_up.weight3Aqs = util.global.load @"__auto.blk.10.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%185 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_down.q_input:rscale" : tensor<f32> | |
%186 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_down.weight3Aqs = util.global.load @"__auto.blk.10.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%187 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xbf16> | |
%188 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.11.attn_q.q_input3Arscale = util.global.load @"__auto.blk.11.attn_q.q_input:rscale" : tensor<f32> | |
%189 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_q.weight3Aqs = util.global.load @"__auto.blk.11.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%190 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_k.q_input3Arscale = util.global.load @"__auto.blk.11.attn_k.q_input:rscale" : tensor<f32> | |
%191 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_k.weight3Aqs = util.global.load @"__auto.blk.11.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%192 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_v.q_input3Arscale = util.global.load @"__auto.blk.11.attn_v.q_input:rscale" : tensor<f32> | |
%193 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_v.weight3Aqs = util.global.load @"__auto.blk.11.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%194 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.11.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.11.kv_cache.quantizer:rscale" : tensor<f32> | |
%195 = torch_c.from_builtin_tensor %__auto.blk.11.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_output.q_input3Arscale = util.global.load @"__auto.blk.11.attn_output.q_input:rscale" : tensor<f32> | |
%196 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_output.weight3Aqs = util.global.load @"__auto.blk.11.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%197 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> | |
%198 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.11.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_gate.q_input:rscale" : tensor<f32> | |
%199 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.11.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%200 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_up.q_input:rscale" : tensor<f32> | |
%201 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_up.weight3Aqs = util.global.load @"__auto.blk.11.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%202 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_down.q_input:rscale" : tensor<f32> | |
%203 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_down.weight3Aqs = util.global.load @"__auto.blk.11.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%204 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xbf16> | |
%205 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.12.attn_q.q_input3Arscale = util.global.load @"__auto.blk.12.attn_q.q_input:rscale" : tensor<f32> | |
%206 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_q.weight3Aqs = util.global.load @"__auto.blk.12.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%207 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_k.q_input3Arscale = util.global.load @"__auto.blk.12.attn_k.q_input:rscale" : tensor<f32> | |
%208 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_k.weight3Aqs = util.global.load @"__auto.blk.12.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%209 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_v.q_input3Arscale = util.global.load @"__auto.blk.12.attn_v.q_input:rscale" : tensor<f32> | |
%210 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_v.weight3Aqs = util.global.load @"__auto.blk.12.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%211 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.12.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.12.kv_cache.quantizer:rscale" : tensor<f32> | |
%212 = torch_c.from_builtin_tensor %__auto.blk.12.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_output.q_input3Arscale = util.global.load @"__auto.blk.12.attn_output.q_input:rscale" : tensor<f32> | |
%213 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_output.weight3Aqs = util.global.load @"__auto.blk.12.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%214 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> | |
%215 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.12.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_gate.q_input:rscale" : tensor<f32> | |
%216 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.12.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%217 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_up.q_input:rscale" : tensor<f32> | |
%218 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_up.weight3Aqs = util.global.load @"__auto.blk.12.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%219 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_down.q_input:rscale" : tensor<f32> | |
%220 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_down.weight3Aqs = util.global.load @"__auto.blk.12.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%221 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xbf16> | |
%222 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.13.attn_q.q_input3Arscale = util.global.load @"__auto.blk.13.attn_q.q_input:rscale" : tensor<f32> | |
%223 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_q.weight3Aqs = util.global.load @"__auto.blk.13.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%224 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_k.q_input3Arscale = util.global.load @"__auto.blk.13.attn_k.q_input:rscale" : tensor<f32> | |
%225 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_k.weight3Aqs = util.global.load @"__auto.blk.13.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%226 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_v.q_input3Arscale = util.global.load @"__auto.blk.13.attn_v.q_input:rscale" : tensor<f32> | |
%227 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_v.weight3Aqs = util.global.load @"__auto.blk.13.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%228 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.13.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.13.kv_cache.quantizer:rscale" : tensor<f32> | |
%229 = torch_c.from_builtin_tensor %__auto.blk.13.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_output.q_input3Arscale = util.global.load @"__auto.blk.13.attn_output.q_input:rscale" : tensor<f32> | |
%230 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_output.weight3Aqs = util.global.load @"__auto.blk.13.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%231 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> | |
%232 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.13.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_gate.q_input:rscale" : tensor<f32> | |
%233 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.13.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%234 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_up.q_input:rscale" : tensor<f32> | |
%235 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_up.weight3Aqs = util.global.load @"__auto.blk.13.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%236 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_down.q_input:rscale" : tensor<f32> | |
%237 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_down.weight3Aqs = util.global.load @"__auto.blk.13.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%238 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xbf16> | |
%239 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.14.attn_q.q_input3Arscale = util.global.load @"__auto.blk.14.attn_q.q_input:rscale" : tensor<f32> | |
%240 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_q.weight3Aqs = util.global.load @"__auto.blk.14.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%241 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_k.q_input3Arscale = util.global.load @"__auto.blk.14.attn_k.q_input:rscale" : tensor<f32> | |
%242 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_k.weight3Aqs = util.global.load @"__auto.blk.14.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%243 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_v.q_input3Arscale = util.global.load @"__auto.blk.14.attn_v.q_input:rscale" : tensor<f32> | |
%244 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_v.weight3Aqs = util.global.load @"__auto.blk.14.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%245 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.14.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.14.kv_cache.quantizer:rscale" : tensor<f32> | |
%246 = torch_c.from_builtin_tensor %__auto.blk.14.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_output.q_input3Arscale = util.global.load @"__auto.blk.14.attn_output.q_input:rscale" : tensor<f32> | |
%247 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_output.weight3Aqs = util.global.load @"__auto.blk.14.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%248 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> | |
%249 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.14.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_gate.q_input:rscale" : tensor<f32> | |
%250 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.14.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%251 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_up.q_input:rscale" : tensor<f32> | |
%252 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_up.weight3Aqs = util.global.load @"__auto.blk.14.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%253 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_down.q_input:rscale" : tensor<f32> | |
%254 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_down.weight3Aqs = util.global.load @"__auto.blk.14.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%255 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xbf16> | |
%256 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.15.attn_q.q_input3Arscale = util.global.load @"__auto.blk.15.attn_q.q_input:rscale" : tensor<f32> | |
%257 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_q.weight3Aqs = util.global.load @"__auto.blk.15.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%258 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_k.q_input3Arscale = util.global.load @"__auto.blk.15.attn_k.q_input:rscale" : tensor<f32> | |
%259 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_k.weight3Aqs = util.global.load @"__auto.blk.15.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%260 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_v.q_input3Arscale = util.global.load @"__auto.blk.15.attn_v.q_input:rscale" : tensor<f32> | |
%261 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_v.weight3Aqs = util.global.load @"__auto.blk.15.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%262 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.15.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.15.kv_cache.quantizer:rscale" : tensor<f32> | |
%263 = torch_c.from_builtin_tensor %__auto.blk.15.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_output.q_input3Arscale = util.global.load @"__auto.blk.15.attn_output.q_input:rscale" : tensor<f32> | |
%264 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_output.weight3Aqs = util.global.load @"__auto.blk.15.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%265 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> | |
%266 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.15.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_gate.q_input:rscale" : tensor<f32> | |
%267 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.15.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%268 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_up.q_input:rscale" : tensor<f32> | |
%269 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_up.weight3Aqs = util.global.load @"__auto.blk.15.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%270 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_down.q_input:rscale" : tensor<f32> | |
%271 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_down.weight3Aqs = util.global.load @"__auto.blk.15.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%272 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xbf16> | |
%273 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.16.attn_q.q_input3Arscale = util.global.load @"__auto.blk.16.attn_q.q_input:rscale" : tensor<f32> | |
%274 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_q.weight3Aqs = util.global.load @"__auto.blk.16.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%275 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_k.q_input3Arscale = util.global.load @"__auto.blk.16.attn_k.q_input:rscale" : tensor<f32> | |
%276 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_k.weight3Aqs = util.global.load @"__auto.blk.16.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%277 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_v.q_input3Arscale = util.global.load @"__auto.blk.16.attn_v.q_input:rscale" : tensor<f32> | |
%278 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_v.weight3Aqs = util.global.load @"__auto.blk.16.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%279 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.16.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.16.kv_cache.quantizer:rscale" : tensor<f32> | |
%280 = torch_c.from_builtin_tensor %__auto.blk.16.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_output.q_input3Arscale = util.global.load @"__auto.blk.16.attn_output.q_input:rscale" : tensor<f32> | |
%281 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_output.weight3Aqs = util.global.load @"__auto.blk.16.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%282 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> | |
%283 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.16.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_gate.q_input:rscale" : tensor<f32> | |
%284 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.16.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%285 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_up.q_input:rscale" : tensor<f32> | |
%286 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_up.weight3Aqs = util.global.load @"__auto.blk.16.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%287 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_down.q_input:rscale" : tensor<f32> | |
%288 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_down.weight3Aqs = util.global.load @"__auto.blk.16.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%289 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xbf16> | |
%290 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.17.attn_q.q_input3Arscale = util.global.load @"__auto.blk.17.attn_q.q_input:rscale" : tensor<f32> | |
%291 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_q.weight3Aqs = util.global.load @"__auto.blk.17.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%292 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_k.q_input3Arscale = util.global.load @"__auto.blk.17.attn_k.q_input:rscale" : tensor<f32> | |
%293 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_k.weight3Aqs = util.global.load @"__auto.blk.17.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%294 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_v.q_input3Arscale = util.global.load @"__auto.blk.17.attn_v.q_input:rscale" : tensor<f32> | |
%295 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_v.weight3Aqs = util.global.load @"__auto.blk.17.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%296 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.17.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.17.kv_cache.quantizer:rscale" : tensor<f32> | |
%297 = torch_c.from_builtin_tensor %__auto.blk.17.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_output.q_input3Arscale = util.global.load @"__auto.blk.17.attn_output.q_input:rscale" : tensor<f32> | |
%298 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_output.weight3Aqs = util.global.load @"__auto.blk.17.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%299 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> | |
%300 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.17.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_gate.q_input:rscale" : tensor<f32> | |
%301 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.17.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%302 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_up.q_input:rscale" : tensor<f32> | |
%303 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_up.weight3Aqs = util.global.load @"__auto.blk.17.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%304 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_down.q_input:rscale" : tensor<f32> | |
%305 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_down.weight3Aqs = util.global.load @"__auto.blk.17.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%306 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xbf16> | |
%307 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.18.attn_q.q_input3Arscale = util.global.load @"__auto.blk.18.attn_q.q_input:rscale" : tensor<f32> | |
%308 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_q.weight3Aqs = util.global.load @"__auto.blk.18.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%309 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_k.q_input3Arscale = util.global.load @"__auto.blk.18.attn_k.q_input:rscale" : tensor<f32> | |
%310 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_k.weight3Aqs = util.global.load @"__auto.blk.18.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%311 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_v.q_input3Arscale = util.global.load @"__auto.blk.18.attn_v.q_input:rscale" : tensor<f32> | |
%312 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_v.weight3Aqs = util.global.load @"__auto.blk.18.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%313 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.18.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.18.kv_cache.quantizer:rscale" : tensor<f32> | |
%314 = torch_c.from_builtin_tensor %__auto.blk.18.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_output.q_input3Arscale = util.global.load @"__auto.blk.18.attn_output.q_input:rscale" : tensor<f32> | |
%315 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_output.weight3Aqs = util.global.load @"__auto.blk.18.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%316 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> | |
%317 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.18.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_gate.q_input:rscale" : tensor<f32> | |
%318 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.18.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%319 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_up.q_input:rscale" : tensor<f32> | |
%320 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_up.weight3Aqs = util.global.load @"__auto.blk.18.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%321 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_down.q_input:rscale" : tensor<f32> | |
%322 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_down.weight3Aqs = util.global.load @"__auto.blk.18.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%323 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xbf16> | |
%324 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.19.attn_q.q_input3Arscale = util.global.load @"__auto.blk.19.attn_q.q_input:rscale" : tensor<f32> | |
%325 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_q.weight3Aqs = util.global.load @"__auto.blk.19.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%326 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_k.q_input3Arscale = util.global.load @"__auto.blk.19.attn_k.q_input:rscale" : tensor<f32> | |
%327 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_k.weight3Aqs = util.global.load @"__auto.blk.19.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%328 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_v.q_input3Arscale = util.global.load @"__auto.blk.19.attn_v.q_input:rscale" : tensor<f32> | |
%329 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_v.weight3Aqs = util.global.load @"__auto.blk.19.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%330 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.19.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.19.kv_cache.quantizer:rscale" : tensor<f32> | |
%331 = torch_c.from_builtin_tensor %__auto.blk.19.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_output.q_input3Arscale = util.global.load @"__auto.blk.19.attn_output.q_input:rscale" : tensor<f32> | |
%332 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_output.weight3Aqs = util.global.load @"__auto.blk.19.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%333 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> | |
%334 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.19.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_gate.q_input:rscale" : tensor<f32> | |
%335 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.19.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%336 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_up.q_input:rscale" : tensor<f32> | |
%337 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_up.weight3Aqs = util.global.load @"__auto.blk.19.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%338 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_down.q_input:rscale" : tensor<f32> | |
%339 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_down.weight3Aqs = util.global.load @"__auto.blk.19.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%340 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xbf16> | |
%341 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.20.attn_q.q_input3Arscale = util.global.load @"__auto.blk.20.attn_q.q_input:rscale" : tensor<f32> | |
%342 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_q.weight3Aqs = util.global.load @"__auto.blk.20.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%343 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_k.q_input3Arscale = util.global.load @"__auto.blk.20.attn_k.q_input:rscale" : tensor<f32> | |
%344 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_k.weight3Aqs = util.global.load @"__auto.blk.20.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%345 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_v.q_input3Arscale = util.global.load @"__auto.blk.20.attn_v.q_input:rscale" : tensor<f32> | |
%346 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_v.weight3Aqs = util.global.load @"__auto.blk.20.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%347 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.20.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.20.kv_cache.quantizer:rscale" : tensor<f32> | |
%348 = torch_c.from_builtin_tensor %__auto.blk.20.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_output.q_input3Arscale = util.global.load @"__auto.blk.20.attn_output.q_input:rscale" : tensor<f32> | |
%349 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_output.weight3Aqs = util.global.load @"__auto.blk.20.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%350 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> | |
%351 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.20.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_gate.q_input:rscale" : tensor<f32> | |
%352 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.20.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%353 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_up.q_input:rscale" : tensor<f32> | |
%354 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_up.weight3Aqs = util.global.load @"__auto.blk.20.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%355 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_down.q_input:rscale" : tensor<f32> | |
%356 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_down.weight3Aqs = util.global.load @"__auto.blk.20.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%357 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xbf16> | |
%358 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.21.attn_q.q_input3Arscale = util.global.load @"__auto.blk.21.attn_q.q_input:rscale" : tensor<f32> | |
%359 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_q.weight3Aqs = util.global.load @"__auto.blk.21.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%360 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_k.q_input3Arscale = util.global.load @"__auto.blk.21.attn_k.q_input:rscale" : tensor<f32> | |
%361 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_k.weight3Aqs = util.global.load @"__auto.blk.21.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%362 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_v.q_input3Arscale = util.global.load @"__auto.blk.21.attn_v.q_input:rscale" : tensor<f32> | |
%363 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_v.weight3Aqs = util.global.load @"__auto.blk.21.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%364 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.21.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.21.kv_cache.quantizer:rscale" : tensor<f32> | |
%365 = torch_c.from_builtin_tensor %__auto.blk.21.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_output.q_input3Arscale = util.global.load @"__auto.blk.21.attn_output.q_input:rscale" : tensor<f32> | |
%366 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_output.weight3Aqs = util.global.load @"__auto.blk.21.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%367 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> | |
%368 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.21.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_gate.q_input:rscale" : tensor<f32> | |
%369 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.21.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%370 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_up.q_input:rscale" : tensor<f32> | |
%371 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_up.weight3Aqs = util.global.load @"__auto.blk.21.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%372 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_down.q_input:rscale" : tensor<f32> | |
%373 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_down.weight3Aqs = util.global.load @"__auto.blk.21.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%374 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xbf16> | |
%375 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.22.attn_q.q_input3Arscale = util.global.load @"__auto.blk.22.attn_q.q_input:rscale" : tensor<f32> | |
%376 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_q.weight3Aqs = util.global.load @"__auto.blk.22.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%377 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_k.q_input3Arscale = util.global.load @"__auto.blk.22.attn_k.q_input:rscale" : tensor<f32> | |
%378 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_k.weight3Aqs = util.global.load @"__auto.blk.22.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%379 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_v.q_input3Arscale = util.global.load @"__auto.blk.22.attn_v.q_input:rscale" : tensor<f32> | |
%380 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_v.weight3Aqs = util.global.load @"__auto.blk.22.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%381 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.22.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.22.kv_cache.quantizer:rscale" : tensor<f32> | |
%382 = torch_c.from_builtin_tensor %__auto.blk.22.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_output.q_input3Arscale = util.global.load @"__auto.blk.22.attn_output.q_input:rscale" : tensor<f32> | |
%383 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_output.weight3Aqs = util.global.load @"__auto.blk.22.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%384 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> | |
%385 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.22.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_gate.q_input:rscale" : tensor<f32> | |
%386 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.22.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%387 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_up.q_input:rscale" : tensor<f32> | |
%388 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_up.weight3Aqs = util.global.load @"__auto.blk.22.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%389 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_down.q_input:rscale" : tensor<f32> | |
%390 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_down.weight3Aqs = util.global.load @"__auto.blk.22.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%391 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xbf16> | |
%392 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.23.attn_q.q_input3Arscale = util.global.load @"__auto.blk.23.attn_q.q_input:rscale" : tensor<f32> | |
%393 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_q.weight3Aqs = util.global.load @"__auto.blk.23.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%394 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_k.q_input3Arscale = util.global.load @"__auto.blk.23.attn_k.q_input:rscale" : tensor<f32> | |
%395 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_k.weight3Aqs = util.global.load @"__auto.blk.23.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%396 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_v.q_input3Arscale = util.global.load @"__auto.blk.23.attn_v.q_input:rscale" : tensor<f32> | |
%397 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_v.weight3Aqs = util.global.load @"__auto.blk.23.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%398 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.23.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.23.kv_cache.quantizer:rscale" : tensor<f32> | |
%399 = torch_c.from_builtin_tensor %__auto.blk.23.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_output.q_input3Arscale = util.global.load @"__auto.blk.23.attn_output.q_input:rscale" : tensor<f32> | |
%400 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_output.weight3Aqs = util.global.load @"__auto.blk.23.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%401 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> | |
%402 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.23.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_gate.q_input:rscale" : tensor<f32> | |
%403 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.23.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%404 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_up.q_input:rscale" : tensor<f32> | |
%405 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_up.weight3Aqs = util.global.load @"__auto.blk.23.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%406 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_down.q_input:rscale" : tensor<f32> | |
%407 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_down.weight3Aqs = util.global.load @"__auto.blk.23.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%408 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xbf16> | |
%409 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.24.attn_q.q_input3Arscale = util.global.load @"__auto.blk.24.attn_q.q_input:rscale" : tensor<f32> | |
%410 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_q.weight3Aqs = util.global.load @"__auto.blk.24.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%411 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_k.q_input3Arscale = util.global.load @"__auto.blk.24.attn_k.q_input:rscale" : tensor<f32> | |
%412 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_k.weight3Aqs = util.global.load @"__auto.blk.24.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%413 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_v.q_input3Arscale = util.global.load @"__auto.blk.24.attn_v.q_input:rscale" : tensor<f32> | |
%414 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_v.weight3Aqs = util.global.load @"__auto.blk.24.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%415 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.24.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.24.kv_cache.quantizer:rscale" : tensor<f32> | |
%416 = torch_c.from_builtin_tensor %__auto.blk.24.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_output.q_input3Arscale = util.global.load @"__auto.blk.24.attn_output.q_input:rscale" : tensor<f32> | |
%417 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_output.weight3Aqs = util.global.load @"__auto.blk.24.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%418 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> | |
%419 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.24.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_gate.q_input:rscale" : tensor<f32> | |
%420 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.24.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%421 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_up.q_input:rscale" : tensor<f32> | |
%422 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_up.weight3Aqs = util.global.load @"__auto.blk.24.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%423 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_down.q_input:rscale" : tensor<f32> | |
%424 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_down.weight3Aqs = util.global.load @"__auto.blk.24.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%425 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xbf16> | |
%426 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.25.attn_q.q_input3Arscale = util.global.load @"__auto.blk.25.attn_q.q_input:rscale" : tensor<f32> | |
%427 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_q.weight3Aqs = util.global.load @"__auto.blk.25.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%428 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_k.q_input3Arscale = util.global.load @"__auto.blk.25.attn_k.q_input:rscale" : tensor<f32> | |
%429 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_k.weight3Aqs = util.global.load @"__auto.blk.25.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%430 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_v.q_input3Arscale = util.global.load @"__auto.blk.25.attn_v.q_input:rscale" : tensor<f32> | |
%431 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_v.weight3Aqs = util.global.load @"__auto.blk.25.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%432 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.25.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.25.kv_cache.quantizer:rscale" : tensor<f32> | |
%433 = torch_c.from_builtin_tensor %__auto.blk.25.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_output.q_input3Arscale = util.global.load @"__auto.blk.25.attn_output.q_input:rscale" : tensor<f32> | |
%434 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_output.weight3Aqs = util.global.load @"__auto.blk.25.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%435 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> | |
%436 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.25.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_gate.q_input:rscale" : tensor<f32> | |
%437 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.25.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%438 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_up.q_input:rscale" : tensor<f32> | |
%439 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_up.weight3Aqs = util.global.load @"__auto.blk.25.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%440 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_down.q_input:rscale" : tensor<f32> | |
%441 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_down.weight3Aqs = util.global.load @"__auto.blk.25.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%442 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xbf16> | |
%443 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.26.attn_q.q_input3Arscale = util.global.load @"__auto.blk.26.attn_q.q_input:rscale" : tensor<f32> | |
%444 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_q.weight3Aqs = util.global.load @"__auto.blk.26.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%445 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_k.q_input3Arscale = util.global.load @"__auto.blk.26.attn_k.q_input:rscale" : tensor<f32> | |
%446 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_k.weight3Aqs = util.global.load @"__auto.blk.26.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%447 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_v.q_input3Arscale = util.global.load @"__auto.blk.26.attn_v.q_input:rscale" : tensor<f32> | |
%448 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_v.weight3Aqs = util.global.load @"__auto.blk.26.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%449 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.26.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.26.kv_cache.quantizer:rscale" : tensor<f32> | |
%450 = torch_c.from_builtin_tensor %__auto.blk.26.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_output.q_input3Arscale = util.global.load @"__auto.blk.26.attn_output.q_input:rscale" : tensor<f32> | |
%451 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_output.weight3Aqs = util.global.load @"__auto.blk.26.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%452 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> | |
%453 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.26.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_gate.q_input:rscale" : tensor<f32> | |
%454 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.26.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%455 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_up.q_input:rscale" : tensor<f32> | |
%456 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_up.weight3Aqs = util.global.load @"__auto.blk.26.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%457 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_down.q_input:rscale" : tensor<f32> | |
%458 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_down.weight3Aqs = util.global.load @"__auto.blk.26.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%459 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xbf16> | |
%460 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.27.attn_q.q_input3Arscale = util.global.load @"__auto.blk.27.attn_q.q_input:rscale" : tensor<f32> | |
%461 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_q.weight3Aqs = util.global.load @"__auto.blk.27.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%462 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_k.q_input3Arscale = util.global.load @"__auto.blk.27.attn_k.q_input:rscale" : tensor<f32> | |
%463 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_k.weight3Aqs = util.global.load @"__auto.blk.27.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%464 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_v.q_input3Arscale = util.global.load @"__auto.blk.27.attn_v.q_input:rscale" : tensor<f32> | |
%465 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_v.weight3Aqs = util.global.load @"__auto.blk.27.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%466 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.27.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.27.kv_cache.quantizer:rscale" : tensor<f32> | |
%467 = torch_c.from_builtin_tensor %__auto.blk.27.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_output.q_input3Arscale = util.global.load @"__auto.blk.27.attn_output.q_input:rscale" : tensor<f32> | |
%468 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_output.weight3Aqs = util.global.load @"__auto.blk.27.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%469 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> | |
%470 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.27.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_gate.q_input:rscale" : tensor<f32> | |
%471 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.27.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%472 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_up.q_input:rscale" : tensor<f32> | |
%473 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_up.weight3Aqs = util.global.load @"__auto.blk.27.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%474 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_down.q_input:rscale" : tensor<f32> | |
%475 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_down.weight3Aqs = util.global.load @"__auto.blk.27.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%476 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xbf16> | |
%477 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.28.attn_q.q_input3Arscale = util.global.load @"__auto.blk.28.attn_q.q_input:rscale" : tensor<f32> | |
%478 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_q.weight3Aqs = util.global.load @"__auto.blk.28.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%479 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_k.q_input3Arscale = util.global.load @"__auto.blk.28.attn_k.q_input:rscale" : tensor<f32> | |
%480 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_k.weight3Aqs = util.global.load @"__auto.blk.28.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%481 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_v.q_input3Arscale = util.global.load @"__auto.blk.28.attn_v.q_input:rscale" : tensor<f32> | |
%482 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_v.weight3Aqs = util.global.load @"__auto.blk.28.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%483 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.28.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.28.kv_cache.quantizer:rscale" : tensor<f32> | |
%484 = torch_c.from_builtin_tensor %__auto.blk.28.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_output.q_input3Arscale = util.global.load @"__auto.blk.28.attn_output.q_input:rscale" : tensor<f32> | |
%485 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_output.weight3Aqs = util.global.load @"__auto.blk.28.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%486 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> | |
%487 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.28.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_gate.q_input:rscale" : tensor<f32> | |
%488 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.28.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%489 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_up.q_input:rscale" : tensor<f32> | |
%490 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_up.weight3Aqs = util.global.load @"__auto.blk.28.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%491 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_down.q_input:rscale" : tensor<f32> | |
%492 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_down.weight3Aqs = util.global.load @"__auto.blk.28.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%493 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xbf16> | |
%494 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.29.attn_q.q_input3Arscale = util.global.load @"__auto.blk.29.attn_q.q_input:rscale" : tensor<f32> | |
%495 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_q.weight3Aqs = util.global.load @"__auto.blk.29.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%496 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_k.q_input3Arscale = util.global.load @"__auto.blk.29.attn_k.q_input:rscale" : tensor<f32> | |
%497 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_k.weight3Aqs = util.global.load @"__auto.blk.29.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%498 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_v.q_input3Arscale = util.global.load @"__auto.blk.29.attn_v.q_input:rscale" : tensor<f32> | |
%499 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_v.weight3Aqs = util.global.load @"__auto.blk.29.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%500 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.29.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.29.kv_cache.quantizer:rscale" : tensor<f32> | |
%501 = torch_c.from_builtin_tensor %__auto.blk.29.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_output.q_input3Arscale = util.global.load @"__auto.blk.29.attn_output.q_input:rscale" : tensor<f32> | |
%502 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_output.weight3Aqs = util.global.load @"__auto.blk.29.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%503 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> | |
%504 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.29.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_gate.q_input:rscale" : tensor<f32> | |
%505 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.29.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%506 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_up.q_input:rscale" : tensor<f32> | |
%507 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_up.weight3Aqs = util.global.load @"__auto.blk.29.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%508 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_down.q_input:rscale" : tensor<f32> | |
%509 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_down.weight3Aqs = util.global.load @"__auto.blk.29.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%510 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xbf16> | |
%511 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.30.attn_q.q_input3Arscale = util.global.load @"__auto.blk.30.attn_q.q_input:rscale" : tensor<f32> | |
%512 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_q.weight3Aqs = util.global.load @"__auto.blk.30.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%513 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_k.q_input3Arscale = util.global.load @"__auto.blk.30.attn_k.q_input:rscale" : tensor<f32> | |
%514 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_k.weight3Aqs = util.global.load @"__auto.blk.30.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%515 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_v.q_input3Arscale = util.global.load @"__auto.blk.30.attn_v.q_input:rscale" : tensor<f32> | |
%516 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_v.weight3Aqs = util.global.load @"__auto.blk.30.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%517 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.30.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.30.kv_cache.quantizer:rscale" : tensor<f32> | |
%518 = torch_c.from_builtin_tensor %__auto.blk.30.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_output.q_input3Arscale = util.global.load @"__auto.blk.30.attn_output.q_input:rscale" : tensor<f32> | |
%519 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_output.weight3Aqs = util.global.load @"__auto.blk.30.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%520 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> | |
%521 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.30.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_gate.q_input:rscale" : tensor<f32> | |
%522 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.30.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%523 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_up.q_input:rscale" : tensor<f32> | |
%524 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_up.weight3Aqs = util.global.load @"__auto.blk.30.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%525 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_down.q_input:rscale" : tensor<f32> | |
%526 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_down.weight3Aqs = util.global.load @"__auto.blk.30.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%527 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xbf16> | |
%528 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.31.attn_q.q_input3Arscale = util.global.load @"__auto.blk.31.attn_q.q_input:rscale" : tensor<f32> | |
%529 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_q.weight3Aqs = util.global.load @"__auto.blk.31.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%530 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_k.q_input3Arscale = util.global.load @"__auto.blk.31.attn_k.q_input:rscale" : tensor<f32> | |
%531 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_k.weight3Aqs = util.global.load @"__auto.blk.31.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%532 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_v.q_input3Arscale = util.global.load @"__auto.blk.31.attn_v.q_input:rscale" : tensor<f32> | |
%533 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_v.weight3Aqs = util.global.load @"__auto.blk.31.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%534 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.31.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.31.kv_cache.quantizer:rscale" : tensor<f32> | |
%535 = torch_c.from_builtin_tensor %__auto.blk.31.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_output.q_input3Arscale = util.global.load @"__auto.blk.31.attn_output.q_input:rscale" : tensor<f32> | |
%536 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_output.weight3Aqs = util.global.load @"__auto.blk.31.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%537 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> | |
%538 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.31.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_gate.q_input:rscale" : tensor<f32> | |
%539 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.31.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%540 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_up.q_input:rscale" : tensor<f32> | |
%541 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_up.weight3Aqs = util.global.load @"__auto.blk.31.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%542 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_down.q_input:rscale" : tensor<f32> | |
%543 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_down.weight3Aqs = util.global.load @"__auto.blk.31.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%544 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xbf16> | |
%545 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xbf16> | |
%546 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16> | |
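    // --- End of parameter loads. The function body begins below: %arg3 (an f8E4M3FNUZ buffer whose row size of 2097152 appears to be the paged KV-cache storage) is copied to a vtensor, and the dynamic sequence dimensions are bound to the symbolic ints "s1" and "s2".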
%547 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%548 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int | |
%549 = torch.symbolic_int "s2" {min_val = 2, max_val = 9223372036854775806} : !torch.int | |
torch.bind_symbolic_shape %arg0, [%548], affine_map<()[s0] -> (1, s0 * 32)> : !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %arg2, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %547, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
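    // Block 0 prologue: embed the input token ids into bf16, then apply RMSNorm in f32 (mean of squares along the last dim, add eps 1e-05, rsqrt) and scale by blk.0.attn_norm.weight.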
%int-1 = torch.constant.int -1 | |
%false = torch.constant.bool false | |
%false_0 = torch.constant.bool false | |
%550 = torch.aten.embedding %0, %arg0, %int-1, %false, %false_0 : !torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[1,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %550, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int6 = torch.constant.int 6 | |
%551 = torch.prims.convert_element_type %550, %int6 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %551, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2 = torch.constant.int 2 | |
%552 = torch.aten.pow.Tensor_Scalar %551, %int2 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %552, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1 = torch.constant.int -1 | |
%553 = torch.prim.ListConstruct %int-1_1 : (!torch.int) -> !torch.list<int> | |
%true = torch.constant.bool true | |
%none = torch.constant.none | |
%554 = torch.aten.mean.dim %552, %553, %true, %none : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %554, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%int1 = torch.constant.int 1 | |
%555 = torch.aten.add.Scalar %554, %float1.000000e-05, %int1 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %555, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%556 = torch.aten.rsqrt %555 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %556, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%557 = torch.aten.mul.Tensor %551, %556 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %557, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15 = torch.constant.int 15 | |
%558 = torch.prims.convert_element_type %557, %int15 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %558, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%559 = torch.aten.mul.Tensor %1, %558 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %559, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
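    // Static f8 quantization of the attention inputs: the normalized activations are divided by the per-tensor q_input rscale, clamped to the f8E4M3FNUZ range [-240.0, 240.0], and cast to f8 before the mm against the transposed f8 weight; the operand is flattened to [seq, 4096] for the mm, reshaped back, and converted to bf16. The same pattern repeats below for the K and V projections.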
%560 = torch.aten.div.Tensor %559, %2 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %560, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02 = torch.constant.float -2.400000e+02 | |
%float2.400000e02 = torch.constant.float 2.400000e+02 | |
%561 = torch.aten.clamp %560, %float-2.400000e02, %float2.400000e02 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %561, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26 = torch.constant.int 26 | |
%562 = torch.prims.convert_element_type %561, %int26 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %562, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2 = torch.constant.int -2 | |
%int-1_2 = torch.constant.int -1 | |
%563 = torch.aten.transpose.int %3, %int-2, %int-1_2 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int1_3 = torch.constant.int 1 | |
%564 = torch.aten.size.int %arg0, %int1_3 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int | |
%int4096 = torch.constant.int 4096 | |
%565 = torch.prim.ListConstruct %564, %int4096 : (!torch.int, !torch.int) -> !torch.list<int> | |
%566 = torch.aten.view %562, %565 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %566, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%567 = torch.aten.mm %566, %563 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %567, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_4 = torch.constant.int 1 | |
%int4096_5 = torch.constant.int 4096 | |
%568 = torch.prim.ListConstruct %int1_4, %564, %int4096_5 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%569 = torch.aten.view %567, %568 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %569, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_6 = torch.constant.int 15 | |
%570 = torch.prims.convert_element_type %569, %int15_6 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %570, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%571 = torch.aten.div.Tensor %559, %4 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %571, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_7 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_8 = torch.constant.float 2.400000e+02 | |
%572 = torch.aten.clamp %571, %float-2.400000e02_7, %float2.400000e02_8 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %572, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_9 = torch.constant.int 26 | |
%573 = torch.prims.convert_element_type %572, %int26_9 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %573, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_10 = torch.constant.int -2 | |
%int-1_11 = torch.constant.int -1 | |
%574 = torch.aten.transpose.int %5, %int-2_10, %int-1_11 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_12 = torch.constant.int 4096 | |
%575 = torch.prim.ListConstruct %564, %int4096_12 : (!torch.int, !torch.int) -> !torch.list<int> | |
%576 = torch.aten.view %573, %575 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %576, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%577 = torch.aten.mm %576, %574 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %577, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_13 = torch.constant.int 1 | |
%int1024 = torch.constant.int 1024 | |
%578 = torch.prim.ListConstruct %int1_13, %564, %int1024 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%579 = torch.aten.view %577, %578 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %579, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_14 = torch.constant.int 15 | |
%580 = torch.prims.convert_element_type %579, %int15_14 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %580, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%581 = torch.aten.div.Tensor %559, %6 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %581, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_15 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_16 = torch.constant.float 2.400000e+02 | |
%582 = torch.aten.clamp %581, %float-2.400000e02_15, %float2.400000e02_16 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %582, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_17 = torch.constant.int 26 | |
%583 = torch.prims.convert_element_type %582, %int26_17 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %583, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_18 = torch.constant.int -2 | |
%int-1_19 = torch.constant.int -1 | |
%584 = torch.aten.transpose.int %7, %int-2_18, %int-1_19 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_20 = torch.constant.int 4096 | |
%585 = torch.prim.ListConstruct %564, %int4096_20 : (!torch.int, !torch.int) -> !torch.list<int> | |
%586 = torch.aten.view %583, %585 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %586, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%587 = torch.aten.mm %586, %584 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %587, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_21 = torch.constant.int 1 | |
%int1024_22 = torch.constant.int 1024 | |
%588 = torch.prim.ListConstruct %int1_21, %564, %int1024_22 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%589 = torch.aten.view %587, %588 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %589, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_23 = torch.constant.int 15 | |
%590 = torch.prims.convert_element_type %589, %int15_23 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %590, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
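// Reshape Q, K and V into per-head layout: 32 query heads and 8 KV heads, head dimension 128.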
%int1_24 = torch.constant.int 1 | |
%int32 = torch.constant.int 32 | |
%int128 = torch.constant.int 128 | |
%591 = torch.prim.ListConstruct %int1_24, %564, %int32, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%592 = torch.aten.view %570, %591 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %592, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_25 = torch.constant.int 1 | |
%int8 = torch.constant.int 8 | |
%int128_26 = torch.constant.int 128 | |
%593 = torch.prim.ListConstruct %int1_25, %564, %int8, %int128_26 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%594 = torch.aten.view %580, %593 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %594, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_27 = torch.constant.int 1 | |
%int8_28 = torch.constant.int 8 | |
%int128_29 = torch.constant.int 128 | |
%595 = torch.prim.ListConstruct %int1_27, %564, %int8_28, %int128_29 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%596 = torch.aten.view %590, %595 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %596, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
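// Build the RoPE position tables: frequencies with base 500000 over 131072 positions, with a
// wavelength-dependent rescale (factor 8, thresholds 2048/8192) consistent with Llama-3.1-style
// rope scaling, producing bf16 cos/sin tables.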
%int131072 = torch.constant.int 131072 | |
%none_30 = torch.constant.none | |
%none_31 = torch.constant.none | |
%cpu = torch.constant.device "cpu" | |
%false_32 = torch.constant.bool false | |
%597 = torch.aten.arange %int131072, %none_30, %none_31, %cpu, %false_32 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0 = torch.constant.int 0 | |
%int128_33 = torch.constant.int 128 | |
%int2_34 = torch.constant.int 2 | |
%int4 = torch.constant.int 4 | |
%none_35 = torch.constant.none | |
%cpu_36 = torch.constant.device "cpu" | |
%false_37 = torch.constant.bool false | |
%598 = torch.aten.arange.start_step %int0, %int128_33, %int2_34, %int4, %none_35, %cpu_36, %false_37 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_38 = torch.constant.int 6 | |
%599 = torch.prims.convert_element_type %598, %int6_38 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_39 = torch.constant.int 128 | |
%600 = torch.aten.div.Scalar %599, %int128_39 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05 = torch.constant.float 5.000000e+05 | |
%601 = torch.aten.pow.Scalar %float5.000000e05, %600 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%602 = torch.aten.reciprocal %601 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%603 = torch.aten.mul.Scalar %602, %float1.000000e00 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%604 = torch.aten.reciprocal %603 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00 = torch.constant.float 6.2831853071795862 | |
%605 = torch.aten.mul.Scalar %604, %float6.283190e00 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03 = torch.constant.float 8.192000e+03 | |
%606 = torch.aten.gt.Scalar %605, %float8.192000e03 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_40 = torch.constant.int 8 | |
%607 = torch.aten.div.Scalar %603, %int8_40 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%608 = torch.aten.where.self %606, %607, %603 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%609 = torch.aten.reciprocal %605 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192 = torch.constant.int 8192 | |
%610 = torch.aten.mul.Scalar %609, %int8192 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_41 = torch.constant.int 1 | |
%int1_42 = torch.constant.int 1 | |
%611 = torch.aten.sub.Scalar %610, %int1_41, %int1_42 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3 = torch.constant.int 3 | |
%612 = torch.aten.div.Scalar %611, %int3 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_43 = torch.constant.int 1 | |
%int1_44 = torch.constant.int 1 | |
%613 = torch.aten.rsub.Scalar %612, %int1_43, %int1_44 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%614 = torch.aten.mul.Tensor %613, %608 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_45 = torch.constant.int 8 | |
%615 = torch.aten.div.Scalar %614, %int8_45 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%616 = torch.aten.mul.Tensor %612, %608 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_46 = torch.constant.int 1 | |
%617 = torch.aten.add.Tensor %615, %616, %int1_46 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03 = torch.constant.float 2.048000e+03 | |
%618 = torch.aten.lt.Scalar %605, %float2.048000e03 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%619 = torch.aten.bitwise_not %618 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_47 = torch.constant.float 8.192000e+03 | |
%620 = torch.aten.gt.Scalar %605, %float8.192000e03_47 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%621 = torch.aten.bitwise_not %620 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%622 = torch.aten.mul.Tensor %619, %621 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%623 = torch.aten.where.self %622, %617, %608 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%624 = torch.prim.ListConstruct %623, %623 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_48 = torch.constant.int -1 | |
%625 = torch.aten.cat %624, %int-1_48 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_49 = torch.constant.int 6 | |
%626 = torch.prims.convert_element_type %597, %int6_49 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_50 = torch.constant.int 131072 | |
%int1_51 = torch.constant.int 1 | |
%627 = torch.prim.ListConstruct %int131072_50, %int1_51 : (!torch.int, !torch.int) -> !torch.list<int> | |
%628 = torch.aten.view %626, %627 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%629 = torch.aten.mul.Tensor %628, %625 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%630 = torch.aten.cos %629 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_52 = torch.constant.int 15 | |
%631 = torch.prims.convert_element_type %630, %int15_52 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%632 = torch.aten.sin %629 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_53 = torch.constant.int 15 | |
%633 = torch.prims.convert_element_type %632, %int15_53 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
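// Slice the cos/sin tables to the current sequence length and reshape to [1, 1, seq, 128]
// so they broadcast across the query heads.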
%int1_54 = torch.constant.int 1 | |
%634 = torch.aten.size.int %569, %int1_54 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_55 = torch.constant.int 0 | |
%635 = torch.aten.add.int %int0_55, %634 : !torch.int, !torch.int -> !torch.int | |
%int0_56 = torch.constant.int 0 | |
%int0_57 = torch.constant.int 0 | |
%int1_58 = torch.constant.int 1 | |
%636 = torch.aten.slice.Tensor %631, %int0_56, %int0_57, %635, %int1_58 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %636, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_59 = torch.constant.int 1 | |
%int0_60 = torch.constant.int 0 | |
%int9223372036854775807 = torch.constant.int 9223372036854775807 | |
%int1_61 = torch.constant.int 1 | |
%637 = torch.aten.slice.Tensor %636, %int1_59, %int0_60, %int9223372036854775807, %int1_61 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %637, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_62 = torch.constant.int 0 | |
%638 = torch.aten.add.int %int0_62, %634 : !torch.int, !torch.int -> !torch.int | |
%int0_63 = torch.constant.int 0 | |
%int0_64 = torch.constant.int 0 | |
%int1_65 = torch.constant.int 1 | |
%639 = torch.aten.slice.Tensor %633, %int0_63, %int0_64, %638, %int1_65 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %639, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_66 = torch.constant.int 1 | |
%int0_67 = torch.constant.int 0 | |
%int9223372036854775807_68 = torch.constant.int 9223372036854775807 | |
%int1_69 = torch.constant.int 1 | |
%640 = torch.aten.slice.Tensor %639, %int1_66, %int0_67, %int9223372036854775807_68, %int1_69 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %640, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_70 = torch.constant.int 0 | |
%641 = torch.aten.unsqueeze %637, %int0_70 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %641, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_71 = torch.constant.int 1 | |
%int0_72 = torch.constant.int 0 | |
%int9223372036854775807_73 = torch.constant.int 9223372036854775807 | |
%int1_74 = torch.constant.int 1 | |
%642 = torch.aten.slice.Tensor %641, %int1_71, %int0_72, %int9223372036854775807_73, %int1_74 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %642, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_75 = torch.constant.int 2 | |
%643 = torch.aten.unsqueeze %642, %int2_75 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %643, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_76 = torch.constant.int 3 | |
%int0_77 = torch.constant.int 0 | |
%int9223372036854775807_78 = torch.constant.int 9223372036854775807 | |
%int1_79 = torch.constant.int 1 | |
%644 = torch.aten.slice.Tensor %643, %int3_76, %int0_77, %int9223372036854775807_78, %int1_79 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %644, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_80 = torch.constant.int 0 | |
%645 = torch.aten.unsqueeze %640, %int0_80 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %645, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_81 = torch.constant.int 1 | |
%int0_82 = torch.constant.int 0 | |
%int9223372036854775807_83 = torch.constant.int 9223372036854775807 | |
%int1_84 = torch.constant.int 1 | |
%646 = torch.aten.slice.Tensor %645, %int1_81, %int0_82, %int9223372036854775807_83, %int1_84 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %646, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_85 = torch.constant.int 2 | |
%647 = torch.aten.unsqueeze %646, %int2_85 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %647, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_86 = torch.constant.int 3 | |
%int0_87 = torch.constant.int 0 | |
%int9223372036854775807_88 = torch.constant.int 9223372036854775807 | |
%int1_89 = torch.constant.int 1 | |
%648 = torch.aten.slice.Tensor %647, %int3_86, %int0_87, %int9223372036854775807_88, %int1_89 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %648, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_90 = torch.constant.int 1 | |
%int2_91 = torch.constant.int 2 | |
%649 = torch.aten.transpose.int %644, %int1_90, %int2_91 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %649, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_92 = torch.constant.int 1 | |
%int1_93 = torch.constant.int 1 | |
%int1_94 = torch.constant.int 1 | |
%int1_95 = torch.constant.int 1 | |
%650 = torch.prim.ListConstruct %int1_92, %int1_93, %int1_94, %int1_95 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%651 = torch.aten.repeat %649, %650 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %651, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_96 = torch.constant.int 1 | |
%int2_97 = torch.constant.int 2 | |
%652 = torch.aten.transpose.int %648, %int1_96, %int2_97 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %652, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
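// Apply rotary embedding to Q with the rotate-half formulation: q * cos + rotate_half(q) * sin.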
%int1_98 = torch.constant.int 1 | |
%int2_99 = torch.constant.int 2 | |
%653 = torch.aten.transpose.int %592, %int1_98, %int2_99 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %653, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_100 = torch.constant.int 1 | |
%int1_101 = torch.constant.int 1 | |
%int1_102 = torch.constant.int 1 | |
%int1_103 = torch.constant.int 1 | |
%654 = torch.prim.ListConstruct %int1_100, %int1_101, %int1_102, %int1_103 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%655 = torch.aten.repeat %652, %654 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %655, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%656 = torch.aten.mul.Tensor %653, %651 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %656, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_104 = torch.constant.int 3 | |
%int0_105 = torch.constant.int 0 | |
%int64 = torch.constant.int 64 | |
%int1_106 = torch.constant.int 1 | |
%657 = torch.aten.slice.Tensor %653, %int3_104, %int0_105, %int64, %int1_106 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %657, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_107 = torch.constant.int 3 | |
%int64_108 = torch.constant.int 64 | |
%int9223372036854775807_109 = torch.constant.int 9223372036854775807 | |
%int1_110 = torch.constant.int 1 | |
%658 = torch.aten.slice.Tensor %653, %int3_107, %int64_108, %int9223372036854775807_109, %int1_110 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %658, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%659 = torch.aten.neg %658 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %659, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%660 = torch.prim.ListConstruct %659, %657 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_111 = torch.constant.int -1 | |
%661 = torch.aten.cat %660, %int-1_111 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %661, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%662 = torch.aten.mul.Tensor %661, %655 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %662, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_112 = torch.constant.int 1 | |
%663 = torch.aten.add.Tensor %656, %662, %int1_112 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %663, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_113 = torch.constant.int 1 | |
%int2_114 = torch.constant.int 2 | |
%664 = torch.aten.transpose.int %663, %int1_113, %int2_114 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %664, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
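// Recompute the same RoPE cos/sin tables for the K path.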
%int131072_115 = torch.constant.int 131072 | |
%none_116 = torch.constant.none | |
%none_117 = torch.constant.none | |
%cpu_118 = torch.constant.device "cpu" | |
%false_119 = torch.constant.bool false | |
%665 = torch.aten.arange %int131072_115, %none_116, %none_117, %cpu_118, %false_119 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_120 = torch.constant.int 0 | |
%int128_121 = torch.constant.int 128 | |
%int2_122 = torch.constant.int 2 | |
%int4_123 = torch.constant.int 4 | |
%none_124 = torch.constant.none | |
%cpu_125 = torch.constant.device "cpu" | |
%false_126 = torch.constant.bool false | |
%666 = torch.aten.arange.start_step %int0_120, %int128_121, %int2_122, %int4_123, %none_124, %cpu_125, %false_126 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_127 = torch.constant.int 6 | |
%667 = torch.prims.convert_element_type %666, %int6_127 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_128 = torch.constant.int 128 | |
%668 = torch.aten.div.Scalar %667, %int128_128 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_129 = torch.constant.float 5.000000e+05 | |
%669 = torch.aten.pow.Scalar %float5.000000e05_129, %668 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%670 = torch.aten.reciprocal %669 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_130 = torch.constant.float 1.000000e+00 | |
%671 = torch.aten.mul.Scalar %670, %float1.000000e00_130 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%672 = torch.aten.reciprocal %671 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_131 = torch.constant.float 6.2831853071795862 | |
%673 = torch.aten.mul.Scalar %672, %float6.283190e00_131 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_132 = torch.constant.float 8.192000e+03 | |
%674 = torch.aten.gt.Scalar %673, %float8.192000e03_132 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_133 = torch.constant.int 8 | |
%675 = torch.aten.div.Scalar %671, %int8_133 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%676 = torch.aten.where.self %674, %675, %671 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%677 = torch.aten.reciprocal %673 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_134 = torch.constant.int 8192 | |
%678 = torch.aten.mul.Scalar %677, %int8192_134 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_135 = torch.constant.int 1 | |
%int1_136 = torch.constant.int 1 | |
%679 = torch.aten.sub.Scalar %678, %int1_135, %int1_136 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_137 = torch.constant.int 3 | |
%680 = torch.aten.div.Scalar %679, %int3_137 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_138 = torch.constant.int 1 | |
%int1_139 = torch.constant.int 1 | |
%681 = torch.aten.rsub.Scalar %680, %int1_138, %int1_139 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%682 = torch.aten.mul.Tensor %681, %676 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_140 = torch.constant.int 8 | |
%683 = torch.aten.div.Scalar %682, %int8_140 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%684 = torch.aten.mul.Tensor %680, %676 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_141 = torch.constant.int 1 | |
%685 = torch.aten.add.Tensor %683, %684, %int1_141 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_142 = torch.constant.float 2.048000e+03 | |
%686 = torch.aten.lt.Scalar %673, %float2.048000e03_142 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%687 = torch.aten.bitwise_not %686 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_143 = torch.constant.float 8.192000e+03 | |
%688 = torch.aten.gt.Scalar %673, %float8.192000e03_143 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%689 = torch.aten.bitwise_not %688 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%690 = torch.aten.mul.Tensor %687, %689 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%691 = torch.aten.where.self %690, %685, %676 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%692 = torch.prim.ListConstruct %691, %691 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_144 = torch.constant.int -1 | |
%693 = torch.aten.cat %692, %int-1_144 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_145 = torch.constant.int 6 | |
%694 = torch.prims.convert_element_type %665, %int6_145 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_146 = torch.constant.int 131072 | |
%int1_147 = torch.constant.int 1 | |
%695 = torch.prim.ListConstruct %int131072_146, %int1_147 : (!torch.int, !torch.int) -> !torch.list<int> | |
%696 = torch.aten.view %694, %695 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%697 = torch.aten.mul.Tensor %696, %693 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%698 = torch.aten.cos %697 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_148 = torch.constant.int 15 | |
%699 = torch.prims.convert_element_type %698, %int15_148 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%700 = torch.aten.sin %697 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_149 = torch.constant.int 15 | |
%701 = torch.prims.convert_element_type %700, %int15_149 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
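// Slice the K-path cos/sin tables to the sequence length and broadcast to [1, 1, seq, 128].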
%int1_150 = torch.constant.int 1 | |
%702 = torch.aten.size.int %579, %int1_150 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_151 = torch.constant.int 0 | |
%703 = torch.aten.add.int %int0_151, %702 : !torch.int, !torch.int -> !torch.int | |
%int0_152 = torch.constant.int 0 | |
%int0_153 = torch.constant.int 0 | |
%int1_154 = torch.constant.int 1 | |
%704 = torch.aten.slice.Tensor %699, %int0_152, %int0_153, %703, %int1_154 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %704, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_155 = torch.constant.int 1 | |
%int0_156 = torch.constant.int 0 | |
%int9223372036854775807_157 = torch.constant.int 9223372036854775807 | |
%int1_158 = torch.constant.int 1 | |
%705 = torch.aten.slice.Tensor %704, %int1_155, %int0_156, %int9223372036854775807_157, %int1_158 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %705, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_159 = torch.constant.int 0 | |
%706 = torch.aten.add.int %int0_159, %702 : !torch.int, !torch.int -> !torch.int | |
%int0_160 = torch.constant.int 0 | |
%int0_161 = torch.constant.int 0 | |
%int1_162 = torch.constant.int 1 | |
%707 = torch.aten.slice.Tensor %701, %int0_160, %int0_161, %706, %int1_162 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %707, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_163 = torch.constant.int 1 | |
%int0_164 = torch.constant.int 0 | |
%int9223372036854775807_165 = torch.constant.int 9223372036854775807 | |
%int1_166 = torch.constant.int 1 | |
%708 = torch.aten.slice.Tensor %707, %int1_163, %int0_164, %int9223372036854775807_165, %int1_166 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %708, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_167 = torch.constant.int 0 | |
%709 = torch.aten.unsqueeze %705, %int0_167 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %709, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_168 = torch.constant.int 1 | |
%int0_169 = torch.constant.int 0 | |
%int9223372036854775807_170 = torch.constant.int 9223372036854775807 | |
%int1_171 = torch.constant.int 1 | |
%710 = torch.aten.slice.Tensor %709, %int1_168, %int0_169, %int9223372036854775807_170, %int1_171 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %710, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_172 = torch.constant.int 2 | |
%711 = torch.aten.unsqueeze %710, %int2_172 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %711, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_173 = torch.constant.int 3 | |
%int0_174 = torch.constant.int 0 | |
%int9223372036854775807_175 = torch.constant.int 9223372036854775807 | |
%int1_176 = torch.constant.int 1 | |
%712 = torch.aten.slice.Tensor %711, %int3_173, %int0_174, %int9223372036854775807_175, %int1_176 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %712, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_177 = torch.constant.int 0 | |
%713 = torch.aten.unsqueeze %708, %int0_177 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %713, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_178 = torch.constant.int 1 | |
%int0_179 = torch.constant.int 0 | |
%int9223372036854775807_180 = torch.constant.int 9223372036854775807 | |
%int1_181 = torch.constant.int 1 | |
%714 = torch.aten.slice.Tensor %713, %int1_178, %int0_179, %int9223372036854775807_180, %int1_181 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %714, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_182 = torch.constant.int 2 | |
%715 = torch.aten.unsqueeze %714, %int2_182 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %715, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_183 = torch.constant.int 3 | |
%int0_184 = torch.constant.int 0 | |
%int9223372036854775807_185 = torch.constant.int 9223372036854775807 | |
%int1_186 = torch.constant.int 1 | |
%716 = torch.aten.slice.Tensor %715, %int3_183, %int0_184, %int9223372036854775807_185, %int1_186 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %716, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_187 = torch.constant.int 1 | |
%int2_188 = torch.constant.int 2 | |
%717 = torch.aten.transpose.int %712, %int1_187, %int2_188 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %717, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_189 = torch.constant.int 1 | |
%int1_190 = torch.constant.int 1 | |
%int1_191 = torch.constant.int 1 | |
%int1_192 = torch.constant.int 1 | |
%718 = torch.prim.ListConstruct %int1_189, %int1_190, %int1_191, %int1_192 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%719 = torch.aten.repeat %717, %718 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %719, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_193 = torch.constant.int 1 | |
%int2_194 = torch.constant.int 2 | |
%720 = torch.aten.transpose.int %716, %int1_193, %int2_194 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %720, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
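// Apply rotary embedding to K (8 KV heads): k * cos + rotate_half(k) * sin.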
%int1_195 = torch.constant.int 1 | |
%int2_196 = torch.constant.int 2 | |
%721 = torch.aten.transpose.int %594, %int1_195, %int2_196 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %721, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_197 = torch.constant.int 1 | |
%int1_198 = torch.constant.int 1 | |
%int1_199 = torch.constant.int 1 | |
%int1_200 = torch.constant.int 1 | |
%722 = torch.prim.ListConstruct %int1_197, %int1_198, %int1_199, %int1_200 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%723 = torch.aten.repeat %720, %722 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %723, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%724 = torch.aten.mul.Tensor %721, %719 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %724, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_201 = torch.constant.int 3 | |
%int0_202 = torch.constant.int 0 | |
%int64_203 = torch.constant.int 64 | |
%int1_204 = torch.constant.int 1 | |
%725 = torch.aten.slice.Tensor %721, %int3_201, %int0_202, %int64_203, %int1_204 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %725, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_205 = torch.constant.int 3 | |
%int64_206 = torch.constant.int 64 | |
%int9223372036854775807_207 = torch.constant.int 9223372036854775807 | |
%int1_208 = torch.constant.int 1 | |
%726 = torch.aten.slice.Tensor %721, %int3_205, %int64_206, %int9223372036854775807_207, %int1_208 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %726, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%727 = torch.aten.neg %726 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %727, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%728 = torch.prim.ListConstruct %727, %725 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_209 = torch.constant.int -1 | |
%729 = torch.aten.cat %728, %int-1_209 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %729, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%730 = torch.aten.mul.Tensor %729, %723 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %730, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_210 = torch.constant.int 1 | |
%731 = torch.aten.add.Tensor %724, %730, %int1_210 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %731, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_211 = torch.constant.int 1 | |
%int2_212 = torch.constant.int 2 | |
%732 = torch.aten.transpose.int %731, %int1_211, %int2_212 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %732, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
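// Quantize the rotated K and the V activations for the KV cache: divide by the cache quantizer
// scale, clamp to +/-240 (the f8E4M3FNUZ maximum), and convert to f8.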
%733 = torch.aten.div.Tensor %732, %8 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %733, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_213 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_214 = torch.constant.float 2.400000e+02 | |
%734 = torch.aten.clamp %733, %float-2.400000e02_213, %float2.400000e02_214 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %734, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_215 = torch.constant.int 26 | |
%735 = torch.prims.convert_element_type %734, %int26_215 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %735, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%736 = torch.aten.div.Tensor %596, %8 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %736, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_216 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_217 = torch.constant.float 2.400000e+02 | |
%737 = torch.aten.clamp %736, %float-2.400000e02_216, %float2.400000e02_217 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %737, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_218 = torch.constant.int 26 | |
%738 = torch.prims.convert_element_type %737, %int26_218 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %738, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
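// Paged KV-cache update: the flat cache appears to be laid out as
// [page, 32 transformer blocks, 2 (K/V), 32 tokens, 8 KV heads, 128]; it is flattened to rows of
// [32, 8, 128] and index_put writes K at row page_id * 64 (block 0, K slot) and V at the next
// row, before the [pages, 2097152] view is restored.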
%int0_219 = torch.constant.int 0 | |
%739 = torch.aten.size.int %547, %int0_219 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int32_220 = torch.constant.int 32 | |
%int2_221 = torch.constant.int 2 | |
%int32_222 = torch.constant.int 32 | |
%int8_223 = torch.constant.int 8 | |
%int128_224 = torch.constant.int 128 | |
%740 = torch.prim.ListConstruct %739, %int32_220, %int2_221, %int32_222, %int8_223, %int128_224 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%741 = torch.aten.view %547, %740 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %741, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_225 = torch.constant.int 32 | |
%742 = torch.aten.mul.int %739, %int32_225 : !torch.int, !torch.int -> !torch.int | |
%int2_226 = torch.constant.int 2 | |
%743 = torch.aten.mul.int %742, %int2_226 : !torch.int, !torch.int -> !torch.int | |
%int32_227 = torch.constant.int 32 | |
%int8_228 = torch.constant.int 8 | |
%int128_229 = torch.constant.int 128 | |
%744 = torch.prim.ListConstruct %743, %int32_227, %int8_228, %int128_229 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%745 = torch.aten.view %741, %744 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %745, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int64_230 = torch.constant.int 64 | |
%746 = torch.aten.mul.Scalar %arg2, %int64_230 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %746, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int0_231 = torch.constant.int 0 | |
%int1_232 = torch.constant.int 1 | |
%747 = torch.aten.add.Scalar %746, %int0_231, %int1_232 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %747, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int1_233 = torch.constant.int 1 | |
%748 = torch.aten.size.int %arg2, %int1_233 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int | |
%int1_234 = torch.constant.int 1 | |
%int32_235 = torch.constant.int 32 | |
%int8_236 = torch.constant.int 8 | |
%int128_237 = torch.constant.int 128 | |
%749 = torch.prim.ListConstruct %int1_234, %748, %int32_235, %int8_236, %int128_237 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%750 = torch.aten.view %735, %749 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %750, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_238 = torch.constant.int 32 | |
%int8_239 = torch.constant.int 8 | |
%int128_240 = torch.constant.int 128 | |
%751 = torch.prim.ListConstruct %748, %int32_238, %int8_239, %int128_240 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%752 = torch.aten.view %750, %751 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %752, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%753 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%754 = torch.aten.view %747, %753 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %754, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%755 = torch.prim.ListConstruct %754 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_241 = torch.constant.bool false | |
%756 = torch.aten.index_put %745, %755, %752, %false_241 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %756, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_242 = torch.constant.int 32 | |
%int2_243 = torch.constant.int 2 | |
%int32_244 = torch.constant.int 32 | |
%int8_245 = torch.constant.int 8 | |
%int128_246 = torch.constant.int 128 | |
%757 = torch.prim.ListConstruct %739, %int32_242, %int2_243, %int32_244, %int8_245, %int128_246 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%758 = torch.aten.view %756, %757 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %758, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152 = torch.constant.int 2097152 | |
%759 = torch.prim.ListConstruct %739, %int2097152 : (!torch.int, !torch.int) -> !torch.list<int> | |
%760 = torch.aten.view %758, %759 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %760, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%int32_247 = torch.constant.int 32 | |
%int2_248 = torch.constant.int 2 | |
%int32_249 = torch.constant.int 32 | |
%int8_250 = torch.constant.int 8 | |
%int128_251 = torch.constant.int 128 | |
%761 = torch.prim.ListConstruct %739, %int32_247, %int2_248, %int32_249, %int8_250, %int128_251 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%762 = torch.aten.view %760, %761 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %762, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_252 = torch.constant.int 32 | |
%int8_253 = torch.constant.int 8 | |
%int128_254 = torch.constant.int 128 | |
%763 = torch.prim.ListConstruct %743, %int32_252, %int8_253, %int128_254 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%764 = torch.aten.view %762, %763 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %764, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_255 = torch.constant.int 1 | |
%int32_256 = torch.constant.int 32 | |
%int8_257 = torch.constant.int 8 | |
%int128_258 = torch.constant.int 128 | |
%765 = torch.prim.ListConstruct %int1_255, %748, %int32_256, %int8_257, %int128_258 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%766 = torch.aten.view %738, %765 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %766, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_259 = torch.constant.int 32 | |
%int8_260 = torch.constant.int 8 | |
%int128_261 = torch.constant.int 128 | |
%767 = torch.prim.ListConstruct %748, %int32_259, %int8_260, %int128_261 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%768 = torch.aten.view %766, %767 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %768, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_262 = torch.constant.int 1 | |
%int1_263 = torch.constant.int 1 | |
%769 = torch.aten.add.Scalar %747, %int1_262, %int1_263 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %769, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%770 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%771 = torch.aten.view %769, %770 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %771, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%772 = torch.prim.ListConstruct %771 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_264 = torch.constant.bool false | |
%773 = torch.aten.index_put %764, %772, %768, %false_264 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %773, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_265 = torch.constant.int 32 | |
%int2_266 = torch.constant.int 2 | |
%int32_267 = torch.constant.int 32 | |
%int8_268 = torch.constant.int 8 | |
%int128_269 = torch.constant.int 128 | |
%774 = torch.prim.ListConstruct %739, %int32_265, %int2_266, %int32_267, %int8_268, %int128_269 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%775 = torch.aten.view %773, %774 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %775, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_270 = torch.constant.int 2097152 | |
%776 = torch.prim.ListConstruct %739, %int2097152_270 : (!torch.int, !torch.int) -> !torch.list<int> | |
%777 = torch.aten.view %775, %776 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %777, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
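// Grouped-query attention expansion: repeat each of the 8 KV heads 4x so K and V match the
// 32 query heads.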
%int-2_271 = torch.constant.int -2 | |
%778 = torch.aten.unsqueeze %735, %int-2_271 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %778, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_272 = torch.constant.int 1 | |
%int8_273 = torch.constant.int 8 | |
%int4_274 = torch.constant.int 4 | |
%int128_275 = torch.constant.int 128 | |
%779 = torch.prim.ListConstruct %int1_272, %702, %int8_273, %int4_274, %int128_275 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_276 = torch.constant.bool false | |
%780 = torch.aten.expand %778, %779, %false_276 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %780, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_277 = torch.constant.int 0 | |
%781 = torch.aten.clone %780, %int0_277 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %781, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_278 = torch.constant.int 1 | |
%int32_279 = torch.constant.int 32 | |
%int128_280 = torch.constant.int 128 | |
%782 = torch.prim.ListConstruct %int1_278, %702, %int32_279, %int128_280 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%783 = torch.aten._unsafe_view %781, %782 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %783, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_281 = torch.constant.int -2 | |
%784 = torch.aten.unsqueeze %738, %int-2_281 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %784, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_282 = torch.constant.int 1 | |
%785 = torch.aten.size.int %589, %int1_282 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_283 = torch.constant.int 1 | |
%int8_284 = torch.constant.int 8 | |
%int4_285 = torch.constant.int 4 | |
%int128_286 = torch.constant.int 128 | |
%786 = torch.prim.ListConstruct %int1_283, %785, %int8_284, %int4_285, %int128_286 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_287 = torch.constant.bool false | |
%787 = torch.aten.expand %784, %786, %false_287 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %787, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_288 = torch.constant.int 0 | |
%788 = torch.aten.clone %787, %int0_288 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %788, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_289 = torch.constant.int 1 | |
%int32_290 = torch.constant.int 32 | |
%int128_291 = torch.constant.int 128 | |
%789 = torch.prim.ListConstruct %int1_289, %785, %int32_290, %int128_291 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%790 = torch.aten._unsafe_view %788, %789 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %790, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
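// Dequantize the expanded K and V back to bf16 by multiplying with the cache quantizer scale.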
%int6_292 = torch.constant.int 6 | |
%791 = torch.prims.convert_element_type %783, %int6_292 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %791, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%792 = torch.aten.mul.Tensor %791, %8 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %792, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_293 = torch.constant.int 15 | |
%793 = torch.prims.convert_element_type %792, %int15_293 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %793, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_294 = torch.constant.int 6 | |
%794 = torch.prims.convert_element_type %790, %int6_294 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %794, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%795 = torch.aten.mul.Tensor %794, %8 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %795, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_295 = torch.constant.int 15 | |
%796 = torch.prims.convert_element_type %795, %int15_295 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %796, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
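// Causal scaled dot-product attention over the 32 heads (CPU flash-attention kernel,
// dropout 0.0, is_causal = true).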
%int1_296 = torch.constant.int 1 | |
%int2_297 = torch.constant.int 2 | |
%797 = torch.aten.transpose.int %664, %int1_296, %int2_297 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %797, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_298 = torch.constant.int 1 | |
%int2_299 = torch.constant.int 2 | |
%798 = torch.aten.transpose.int %793, %int1_298, %int2_299 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %798, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_300 = torch.constant.int 1 | |
%int2_301 = torch.constant.int 2 | |
%799 = torch.aten.transpose.int %796, %int1_300, %int2_301 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %799, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00 = torch.constant.float 0.000000e+00 | |
%true_302 = torch.constant.bool true | |
%none_303 = torch.constant.none | |
%none_304 = torch.constant.none | |
%800:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%797, %798, %799, %float0.000000e00, %true_302, %none_303, %none_304) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %800#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
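// Attention output projection: transpose back to [1, seq, 32, 128], flatten to [1, seq, 4096],
// quantize to f8E4M3FNUZ, matmul with the attn_output weight, and convert the result to bf16.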
%int1_305 = torch.constant.int 1 | |
%int2_306 = torch.constant.int 2 | |
%801 = torch.aten.transpose.int %800#0, %int1_305, %int2_306 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %801, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_307 = torch.constant.int 1 | |
%int4096_308 = torch.constant.int 4096 | |
%802 = torch.prim.ListConstruct %int1_307, %634, %int4096_308 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%803 = torch.aten.view %801, %802 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %803, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%804 = torch.aten.div.Tensor %803, %9 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %804, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_309 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_310 = torch.constant.float 2.400000e+02 | |
%805 = torch.aten.clamp %804, %float-2.400000e02_309, %float2.400000e02_310 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %805, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_311 = torch.constant.int 26 | |
%806 = torch.prims.convert_element_type %805, %int26_311 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %806, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_312 = torch.constant.int -2 | |
%int-1_313 = torch.constant.int -1 | |
%807 = torch.aten.transpose.int %10, %int-2_312, %int-1_313 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_314 = torch.constant.int 4096 | |
%808 = torch.prim.ListConstruct %634, %int4096_314 : (!torch.int, !torch.int) -> !torch.list<int> | |
%809 = torch.aten.view %806, %808 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %809, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%810 = torch.aten.mm %809, %807 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %810, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_315 = torch.constant.int 1 | |
%int4096_316 = torch.constant.int 4096 | |
%811 = torch.prim.ListConstruct %int1_315, %634, %int4096_316 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%812 = torch.aten.view %810, %811 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %812, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_317 = torch.constant.int 15 | |
%813 = torch.prims.convert_element_type %812, %int15_317 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %813, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_318 = torch.constant.int 1 | |
%814 = torch.aten.add.Tensor %550, %813, %int1_318 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %814, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
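// RMSNorm of the updated residual: square in f32, mean over the last dim, add eps = 1e-5, rsqrt, rescale,
// cast back to bf16, and multiply by the learned 4096-wide norm weight.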
%int6_319 = torch.constant.int 6 | |
%815 = torch.prims.convert_element_type %814, %int6_319 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %815, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_320 = torch.constant.int 2 | |
%816 = torch.aten.pow.Tensor_Scalar %815, %int2_320 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %816, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_321 = torch.constant.int -1 | |
%817 = torch.prim.ListConstruct %int-1_321 : (!torch.int) -> !torch.list<int> | |
%true_322 = torch.constant.bool true | |
%none_323 = torch.constant.none | |
%818 = torch.aten.mean.dim %816, %817, %true_322, %none_323 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %818, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_324 = torch.constant.float 1.000000e-05 | |
%int1_325 = torch.constant.int 1 | |
%819 = torch.aten.add.Scalar %818, %float1.000000e-05_324, %int1_325 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %819, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%820 = torch.aten.rsqrt %819 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %820, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%821 = torch.aten.mul.Tensor %815, %820 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %821, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_326 = torch.constant.int 15 | |
%822 = torch.prims.convert_element_type %821, %int15_326 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %822, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%823 = torch.aten.mul.Tensor %11, %822 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %823, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
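// FFN gate branch: quantize the normalized activation to f8E4M3FNUZ (scale, clamp to ±240, cast), matmul with
// the transposed [14336,4096] gate weight, reshape to [1, seq, 14336], cast to bf16, and apply SiLU.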
%824 = torch.aten.div.Tensor %823, %12 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %824, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_327 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_328 = torch.constant.float 2.400000e+02 | |
%825 = torch.aten.clamp %824, %float-2.400000e02_327, %float2.400000e02_328 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %825, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_329 = torch.constant.int 26 | |
%826 = torch.prims.convert_element_type %825, %int26_329 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %826, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_330 = torch.constant.int -2 | |
%int-1_331 = torch.constant.int -1 | |
%827 = torch.aten.transpose.int %13, %int-2_330, %int-1_331 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_332 = torch.constant.int 4096 | |
%828 = torch.prim.ListConstruct %564, %int4096_332 : (!torch.int, !torch.int) -> !torch.list<int> | |
%829 = torch.aten.view %826, %828 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %829, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%830 = torch.aten.mm %829, %827 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %830, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_333 = torch.constant.int 1 | |
%int14336 = torch.constant.int 14336 | |
%831 = torch.prim.ListConstruct %int1_333, %564, %int14336 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%832 = torch.aten.view %830, %831 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %832, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_334 = torch.constant.int 15 | |
%833 = torch.prims.convert_element_type %832, %int15_334 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %833, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%834 = torch.aten.silu %833 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %834, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
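// FFN up branch: the same quantize/matmul pattern against a second [14336,4096] weight; the SiLU(gate)
// activations are then multiplied elementwise with this up projection.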
%835 = torch.aten.div.Tensor %823, %14 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %835, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_335 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_336 = torch.constant.float 2.400000e+02 | |
%836 = torch.aten.clamp %835, %float-2.400000e02_335, %float2.400000e02_336 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %836, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_337 = torch.constant.int 26 | |
%837 = torch.prims.convert_element_type %836, %int26_337 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %837, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_338 = torch.constant.int -2 | |
%int-1_339 = torch.constant.int -1 | |
%838 = torch.aten.transpose.int %15, %int-2_338, %int-1_339 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_340 = torch.constant.int 4096 | |
%839 = torch.prim.ListConstruct %564, %int4096_340 : (!torch.int, !torch.int) -> !torch.list<int> | |
%840 = torch.aten.view %837, %839 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %840, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%841 = torch.aten.mm %840, %838 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %841, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_341 = torch.constant.int 1 | |
%int14336_342 = torch.constant.int 14336 | |
%842 = torch.prim.ListConstruct %int1_341, %564, %int14336_342 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%843 = torch.aten.view %841, %842 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %843, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_343 = torch.constant.int 15 | |
%844 = torch.prims.convert_element_type %843, %int15_343 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %844, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%845 = torch.aten.mul.Tensor %834, %844 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %845, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
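// FFN down projection: quantize the gated product to f8, matmul with the transposed [4096,14336] weight back
// to width 4096, cast to bf16, and add onto the residual (the block's second residual connection).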
%846 = torch.aten.div.Tensor %845, %16 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %846, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_344 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_345 = torch.constant.float 2.400000e+02 | |
%847 = torch.aten.clamp %846, %float-2.400000e02_344, %float2.400000e02_345 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %847, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_346 = torch.constant.int 26 | |
%848 = torch.prims.convert_element_type %847, %int26_346 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %848, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_347 = torch.constant.int -2 | |
%int-1_348 = torch.constant.int -1 | |
%849 = torch.aten.transpose.int %17, %int-2_347, %int-1_348 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_349 = torch.constant.int 1 | |
%850 = torch.aten.size.int %832, %int1_349 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_350 = torch.constant.int 14336 | |
%851 = torch.prim.ListConstruct %850, %int14336_350 : (!torch.int, !torch.int) -> !torch.list<int> | |
%852 = torch.aten.view %848, %851 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %852, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%853 = torch.aten.mm %852, %849 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %853, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_351 = torch.constant.int 1 | |
%int4096_352 = torch.constant.int 4096 | |
%854 = torch.prim.ListConstruct %int1_351, %850, %int4096_352 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%855 = torch.aten.view %853, %854 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %855, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_353 = torch.constant.int 15 | |
%856 = torch.prims.convert_element_type %855, %int15_353 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %856, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_354 = torch.constant.int 1 | |
%857 = torch.aten.add.Tensor %814, %856, %int1_354 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %857, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
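// The norm followed by fresh Q/K/V projections below appears to begin the next transformer block's attention:
// RMSNorm of the residual with eps = 1e-5 and a learned 4096-wide scale.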
%int6_355 = torch.constant.int 6 | |
%858 = torch.prims.convert_element_type %857, %int6_355 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %858, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_356 = torch.constant.int 2 | |
%859 = torch.aten.pow.Tensor_Scalar %858, %int2_356 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %859, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_357 = torch.constant.int -1 | |
%860 = torch.prim.ListConstruct %int-1_357 : (!torch.int) -> !torch.list<int> | |
%true_358 = torch.constant.bool true | |
%none_359 = torch.constant.none | |
%861 = torch.aten.mean.dim %859, %860, %true_358, %none_359 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %861, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_360 = torch.constant.float 1.000000e-05 | |
%int1_361 = torch.constant.int 1 | |
%862 = torch.aten.add.Scalar %861, %float1.000000e-05_360, %int1_361 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %862, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%863 = torch.aten.rsqrt %862 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %863, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%864 = torch.aten.mul.Tensor %858, %863 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %864, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_362 = torch.constant.int 15 | |
%865 = torch.prims.convert_element_type %864, %int15_362 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %865, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%866 = torch.aten.mul.Tensor %18, %865 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %866, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
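// Q/K/V projections in f8: the normalized activation is scaled, clamped to ±240, cast to f8E4M3FNUZ, then
// matmul'd against transposed [4096,4096] (Q) and [1024,4096] (K and V) weights and cast back to bf16.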
%867 = torch.aten.div.Tensor %866, %19 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %867, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_363 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_364 = torch.constant.float 2.400000e+02 | |
%868 = torch.aten.clamp %867, %float-2.400000e02_363, %float2.400000e02_364 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %868, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_365 = torch.constant.int 26 | |
%869 = torch.prims.convert_element_type %868, %int26_365 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %869, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_366 = torch.constant.int -2 | |
%int-1_367 = torch.constant.int -1 | |
%870 = torch.aten.transpose.int %20, %int-2_366, %int-1_367 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_368 = torch.constant.int 4096 | |
%871 = torch.prim.ListConstruct %564, %int4096_368 : (!torch.int, !torch.int) -> !torch.list<int> | |
%872 = torch.aten.view %869, %871 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %872, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%873 = torch.aten.mm %872, %870 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %873, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_369 = torch.constant.int 1 | |
%int4096_370 = torch.constant.int 4096 | |
%874 = torch.prim.ListConstruct %int1_369, %564, %int4096_370 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%875 = torch.aten.view %873, %874 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %875, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_371 = torch.constant.int 15 | |
%876 = torch.prims.convert_element_type %875, %int15_371 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %876, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%877 = torch.aten.div.Tensor %866, %21 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %877, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_372 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_373 = torch.constant.float 2.400000e+02 | |
%878 = torch.aten.clamp %877, %float-2.400000e02_372, %float2.400000e02_373 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %878, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_374 = torch.constant.int 26 | |
%879 = torch.prims.convert_element_type %878, %int26_374 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %879, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_375 = torch.constant.int -2 | |
%int-1_376 = torch.constant.int -1 | |
%880 = torch.aten.transpose.int %22, %int-2_375, %int-1_376 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_377 = torch.constant.int 4096 | |
%881 = torch.prim.ListConstruct %564, %int4096_377 : (!torch.int, !torch.int) -> !torch.list<int> | |
%882 = torch.aten.view %879, %881 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %882, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%883 = torch.aten.mm %882, %880 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %883, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_378 = torch.constant.int 1 | |
%int1024_379 = torch.constant.int 1024 | |
%884 = torch.prim.ListConstruct %int1_378, %564, %int1024_379 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%885 = torch.aten.view %883, %884 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %885, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_380 = torch.constant.int 15 | |
%886 = torch.prims.convert_element_type %885, %int15_380 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %886, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%887 = torch.aten.div.Tensor %866, %23 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %887, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_381 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_382 = torch.constant.float 2.400000e+02 | |
%888 = torch.aten.clamp %887, %float-2.400000e02_381, %float2.400000e02_382 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %888, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_383 = torch.constant.int 26 | |
%889 = torch.prims.convert_element_type %888, %int26_383 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %889, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_384 = torch.constant.int -2 | |
%int-1_385 = torch.constant.int -1 | |
%890 = torch.aten.transpose.int %24, %int-2_384, %int-1_385 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_386 = torch.constant.int 4096 | |
%891 = torch.prim.ListConstruct %564, %int4096_386 : (!torch.int, !torch.int) -> !torch.list<int> | |
%892 = torch.aten.view %889, %891 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %892, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%893 = torch.aten.mm %892, %890 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %893, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_387 = torch.constant.int 1 | |
%int1024_388 = torch.constant.int 1024 | |
%894 = torch.prim.ListConstruct %int1_387, %564, %int1024_388 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%895 = torch.aten.view %893, %894 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %895, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_389 = torch.constant.int 15 | |
%896 = torch.prims.convert_element_type %895, %int15_389 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %896, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
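// Split the projections into heads: Q -> [1, seq, 32, 128], K and V -> [1, seq, 8, 128], i.e. 32 query heads
// sharing 8 KV heads in a grouped-query attention layout.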
%int1_390 = torch.constant.int 1 | |
%int32_391 = torch.constant.int 32 | |
%int128_392 = torch.constant.int 128 | |
%897 = torch.prim.ListConstruct %int1_390, %564, %int32_391, %int128_392 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%898 = torch.aten.view %876, %897 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %898, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_393 = torch.constant.int 1 | |
%int8_394 = torch.constant.int 8 | |
%int128_395 = torch.constant.int 128 | |
%899 = torch.prim.ListConstruct %int1_393, %564, %int8_394, %int128_395 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%900 = torch.aten.view %886, %899 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %900, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_396 = torch.constant.int 1 | |
%int8_397 = torch.constant.int 8 | |
%int128_398 = torch.constant.int 128 | |
%901 = torch.prim.ListConstruct %int1_396, %564, %int8_397, %int128_398 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%902 = torch.aten.view %896, %901 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %902, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
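// Build the rotary-embedding tables for positions 0..131071: inverse frequencies 500000^(-2i/128), rescaled per
// wavelength in a way consistent with llama3-style RoPE scaling (wavelengths above 8192 divided by 8, below 2048
// left unchanged, a smooth blend in between), then cos/sin of the position-frequency product, cast to bf16.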
%int131072_399 = torch.constant.int 131072 | |
%none_400 = torch.constant.none | |
%none_401 = torch.constant.none | |
%cpu_402 = torch.constant.device "cpu" | |
%false_403 = torch.constant.bool false | |
%903 = torch.aten.arange %int131072_399, %none_400, %none_401, %cpu_402, %false_403 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_404 = torch.constant.int 0 | |
%int128_405 = torch.constant.int 128 | |
%int2_406 = torch.constant.int 2 | |
%int4_407 = torch.constant.int 4 | |
%none_408 = torch.constant.none | |
%cpu_409 = torch.constant.device "cpu" | |
%false_410 = torch.constant.bool false | |
%904 = torch.aten.arange.start_step %int0_404, %int128_405, %int2_406, %int4_407, %none_408, %cpu_409, %false_410 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_411 = torch.constant.int 6 | |
%905 = torch.prims.convert_element_type %904, %int6_411 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_412 = torch.constant.int 128 | |
%906 = torch.aten.div.Scalar %905, %int128_412 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_413 = torch.constant.float 5.000000e+05 | |
%907 = torch.aten.pow.Scalar %float5.000000e05_413, %906 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%908 = torch.aten.reciprocal %907 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_414 = torch.constant.float 1.000000e+00 | |
%909 = torch.aten.mul.Scalar %908, %float1.000000e00_414 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%910 = torch.aten.reciprocal %909 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_415 = torch.constant.float 6.2831853071795862 | |
%911 = torch.aten.mul.Scalar %910, %float6.283190e00_415 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_416 = torch.constant.float 8.192000e+03 | |
%912 = torch.aten.gt.Scalar %911, %float8.192000e03_416 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_417 = torch.constant.int 8 | |
%913 = torch.aten.div.Scalar %909, %int8_417 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%914 = torch.aten.where.self %912, %913, %909 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%915 = torch.aten.reciprocal %911 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_418 = torch.constant.int 8192 | |
%916 = torch.aten.mul.Scalar %915, %int8192_418 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_419 = torch.constant.int 1 | |
%int1_420 = torch.constant.int 1 | |
%917 = torch.aten.sub.Scalar %916, %int1_419, %int1_420 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_421 = torch.constant.int 3 | |
%918 = torch.aten.div.Scalar %917, %int3_421 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_422 = torch.constant.int 1 | |
%int1_423 = torch.constant.int 1 | |
%919 = torch.aten.rsub.Scalar %918, %int1_422, %int1_423 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%920 = torch.aten.mul.Tensor %919, %914 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_424 = torch.constant.int 8 | |
%921 = torch.aten.div.Scalar %920, %int8_424 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%922 = torch.aten.mul.Tensor %918, %914 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_425 = torch.constant.int 1 | |
%923 = torch.aten.add.Tensor %921, %922, %int1_425 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_426 = torch.constant.float 2.048000e+03 | |
%924 = torch.aten.lt.Scalar %911, %float2.048000e03_426 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%925 = torch.aten.bitwise_not %924 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_427 = torch.constant.float 8.192000e+03 | |
%926 = torch.aten.gt.Scalar %911, %float8.192000e03_427 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%927 = torch.aten.bitwise_not %926 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%928 = torch.aten.mul.Tensor %925, %927 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%929 = torch.aten.where.self %928, %923, %914 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%930 = torch.prim.ListConstruct %929, %929 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_428 = torch.constant.int -1 | |
%931 = torch.aten.cat %930, %int-1_428 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_429 = torch.constant.int 6 | |
%932 = torch.prims.convert_element_type %903, %int6_429 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_430 = torch.constant.int 131072 | |
%int1_431 = torch.constant.int 1 | |
%933 = torch.prim.ListConstruct %int131072_430, %int1_431 : (!torch.int, !torch.int) -> !torch.list<int> | |
%934 = torch.aten.view %932, %933 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%935 = torch.aten.mul.Tensor %934, %931 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%936 = torch.aten.cos %935 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_432 = torch.constant.int 15 | |
%937 = torch.prims.convert_element_type %936, %int15_432 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%938 = torch.aten.sin %935 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_433 = torch.constant.int 15 | |
%939 = torch.prims.convert_element_type %938, %int15_433 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
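// Slice the cos/sin tables to the current sequence length and reshape them to [1, seq, 1, 128] for broadcasting.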
%int1_434 = torch.constant.int 1 | |
%940 = torch.aten.size.int %875, %int1_434 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_435 = torch.constant.int 0 | |
%941 = torch.aten.add.int %int0_435, %940 : !torch.int, !torch.int -> !torch.int | |
%int0_436 = torch.constant.int 0 | |
%int0_437 = torch.constant.int 0 | |
%int1_438 = torch.constant.int 1 | |
%942 = torch.aten.slice.Tensor %937, %int0_436, %int0_437, %941, %int1_438 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %942, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_439 = torch.constant.int 1 | |
%int0_440 = torch.constant.int 0 | |
%int9223372036854775807_441 = torch.constant.int 9223372036854775807 | |
%int1_442 = torch.constant.int 1 | |
%943 = torch.aten.slice.Tensor %942, %int1_439, %int0_440, %int9223372036854775807_441, %int1_442 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %943, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_443 = torch.constant.int 0 | |
%944 = torch.aten.add.int %int0_443, %940 : !torch.int, !torch.int -> !torch.int | |
%int0_444 = torch.constant.int 0 | |
%int0_445 = torch.constant.int 0 | |
%int1_446 = torch.constant.int 1 | |
%945 = torch.aten.slice.Tensor %939, %int0_444, %int0_445, %944, %int1_446 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %945, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_447 = torch.constant.int 1 | |
%int0_448 = torch.constant.int 0 | |
%int9223372036854775807_449 = torch.constant.int 9223372036854775807 | |
%int1_450 = torch.constant.int 1 | |
%946 = torch.aten.slice.Tensor %945, %int1_447, %int0_448, %int9223372036854775807_449, %int1_450 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %946, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_451 = torch.constant.int 0 | |
%947 = torch.aten.unsqueeze %943, %int0_451 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %947, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_452 = torch.constant.int 1 | |
%int0_453 = torch.constant.int 0 | |
%int9223372036854775807_454 = torch.constant.int 9223372036854775807 | |
%int1_455 = torch.constant.int 1 | |
%948 = torch.aten.slice.Tensor %947, %int1_452, %int0_453, %int9223372036854775807_454, %int1_455 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %948, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_456 = torch.constant.int 2 | |
%949 = torch.aten.unsqueeze %948, %int2_456 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %949, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_457 = torch.constant.int 3 | |
%int0_458 = torch.constant.int 0 | |
%int9223372036854775807_459 = torch.constant.int 9223372036854775807 | |
%int1_460 = torch.constant.int 1 | |
%950 = torch.aten.slice.Tensor %949, %int3_457, %int0_458, %int9223372036854775807_459, %int1_460 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %950, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_461 = torch.constant.int 0 | |
%951 = torch.aten.unsqueeze %946, %int0_461 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %951, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_462 = torch.constant.int 1 | |
%int0_463 = torch.constant.int 0 | |
%int9223372036854775807_464 = torch.constant.int 9223372036854775807 | |
%int1_465 = torch.constant.int 1 | |
%952 = torch.aten.slice.Tensor %951, %int1_462, %int0_463, %int9223372036854775807_464, %int1_465 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %952, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_466 = torch.constant.int 2 | |
%953 = torch.aten.unsqueeze %952, %int2_466 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %953, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_467 = torch.constant.int 3 | |
%int0_468 = torch.constant.int 0 | |
%int9223372036854775807_469 = torch.constant.int 9223372036854775807 | |
%int1_470 = torch.constant.int 1 | |
%954 = torch.aten.slice.Tensor %953, %int3_467, %int0_468, %int9223372036854775807_469, %int1_470 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %954, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
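// Apply the rotary embedding to Q: broadcast cos/sin to [1, 1, seq, 128] (the repeat factors are all 1, so the
// repeat is a no-op), form rotate_half (negate features 64..127 and concatenate them before 0..63), and combine
// as q*cos + rotate_half(q)*sin in the [1, 32, seq, 128] layout before transposing back.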
%int1_471 = torch.constant.int 1 | |
%int2_472 = torch.constant.int 2 | |
%955 = torch.aten.transpose.int %950, %int1_471, %int2_472 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %955, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_473 = torch.constant.int 1 | |
%int1_474 = torch.constant.int 1 | |
%int1_475 = torch.constant.int 1 | |
%int1_476 = torch.constant.int 1 | |
%956 = torch.prim.ListConstruct %int1_473, %int1_474, %int1_475, %int1_476 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%957 = torch.aten.repeat %955, %956 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %957, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_477 = torch.constant.int 1 | |
%int2_478 = torch.constant.int 2 | |
%958 = torch.aten.transpose.int %954, %int1_477, %int2_478 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %958, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_479 = torch.constant.int 1 | |
%int2_480 = torch.constant.int 2 | |
%959 = torch.aten.transpose.int %898, %int1_479, %int2_480 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %959, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_481 = torch.constant.int 1 | |
%int1_482 = torch.constant.int 1 | |
%int1_483 = torch.constant.int 1 | |
%int1_484 = torch.constant.int 1 | |
%960 = torch.prim.ListConstruct %int1_481, %int1_482, %int1_483, %int1_484 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%961 = torch.aten.repeat %958, %960 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %961, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%962 = torch.aten.mul.Tensor %959, %957 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %962, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_485 = torch.constant.int 3 | |
%int0_486 = torch.constant.int 0 | |
%int64_487 = torch.constant.int 64 | |
%int1_488 = torch.constant.int 1 | |
%963 = torch.aten.slice.Tensor %959, %int3_485, %int0_486, %int64_487, %int1_488 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %963, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_489 = torch.constant.int 3 | |
%int64_490 = torch.constant.int 64 | |
%int9223372036854775807_491 = torch.constant.int 9223372036854775807 | |
%int1_492 = torch.constant.int 1 | |
%964 = torch.aten.slice.Tensor %959, %int3_489, %int64_490, %int9223372036854775807_491, %int1_492 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %964, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%965 = torch.aten.neg %964 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %965, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%966 = torch.prim.ListConstruct %965, %963 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_493 = torch.constant.int -1 | |
%967 = torch.aten.cat %966, %int-1_493 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %967, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%968 = torch.aten.mul.Tensor %967, %961 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %968, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_494 = torch.constant.int 1 | |
%969 = torch.aten.add.Tensor %962, %968, %int1_494 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %969, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_495 = torch.constant.int 1 | |
%int2_496 = torch.constant.int 2 | |
%970 = torch.aten.transpose.int %969, %int1_495, %int2_496 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %970, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
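// The position/frequency table computation above is repeated verbatim for the K path; the cos/sin tables are
// rebuilt from scratch rather than reused.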
%int131072_497 = torch.constant.int 131072 | |
%none_498 = torch.constant.none | |
%none_499 = torch.constant.none | |
%cpu_500 = torch.constant.device "cpu" | |
%false_501 = torch.constant.bool false | |
%971 = torch.aten.arange %int131072_497, %none_498, %none_499, %cpu_500, %false_501 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_502 = torch.constant.int 0 | |
%int128_503 = torch.constant.int 128 | |
%int2_504 = torch.constant.int 2 | |
%int4_505 = torch.constant.int 4 | |
%none_506 = torch.constant.none | |
%cpu_507 = torch.constant.device "cpu" | |
%false_508 = torch.constant.bool false | |
%972 = torch.aten.arange.start_step %int0_502, %int128_503, %int2_504, %int4_505, %none_506, %cpu_507, %false_508 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_509 = torch.constant.int 6 | |
%973 = torch.prims.convert_element_type %972, %int6_509 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_510 = torch.constant.int 128 | |
%974 = torch.aten.div.Scalar %973, %int128_510 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_511 = torch.constant.float 5.000000e+05 | |
%975 = torch.aten.pow.Scalar %float5.000000e05_511, %974 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%976 = torch.aten.reciprocal %975 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_512 = torch.constant.float 1.000000e+00 | |
%977 = torch.aten.mul.Scalar %976, %float1.000000e00_512 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%978 = torch.aten.reciprocal %977 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_513 = torch.constant.float 6.2831853071795862 | |
%979 = torch.aten.mul.Scalar %978, %float6.283190e00_513 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_514 = torch.constant.float 8.192000e+03 | |
%980 = torch.aten.gt.Scalar %979, %float8.192000e03_514 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_515 = torch.constant.int 8 | |
%981 = torch.aten.div.Scalar %977, %int8_515 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%982 = torch.aten.where.self %980, %981, %977 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%983 = torch.aten.reciprocal %979 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_516 = torch.constant.int 8192 | |
%984 = torch.aten.mul.Scalar %983, %int8192_516 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_517 = torch.constant.int 1 | |
%int1_518 = torch.constant.int 1 | |
%985 = torch.aten.sub.Scalar %984, %int1_517, %int1_518 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_519 = torch.constant.int 3 | |
%986 = torch.aten.div.Scalar %985, %int3_519 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_520 = torch.constant.int 1 | |
%int1_521 = torch.constant.int 1 | |
%987 = torch.aten.rsub.Scalar %986, %int1_520, %int1_521 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%988 = torch.aten.mul.Tensor %987, %982 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_522 = torch.constant.int 8 | |
%989 = torch.aten.div.Scalar %988, %int8_522 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%990 = torch.aten.mul.Tensor %986, %982 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_523 = torch.constant.int 1 | |
%991 = torch.aten.add.Tensor %989, %990, %int1_523 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_524 = torch.constant.float 2.048000e+03 | |
%992 = torch.aten.lt.Scalar %979, %float2.048000e03_524 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%993 = torch.aten.bitwise_not %992 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_525 = torch.constant.float 8.192000e+03 | |
%994 = torch.aten.gt.Scalar %979, %float8.192000e03_525 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%995 = torch.aten.bitwise_not %994 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%996 = torch.aten.mul.Tensor %993, %995 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%997 = torch.aten.where.self %996, %991, %982 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%998 = torch.prim.ListConstruct %997, %997 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_526 = torch.constant.int -1 | |
%999 = torch.aten.cat %998, %int-1_526 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_527 = torch.constant.int 6 | |
%1000 = torch.prims.convert_element_type %971, %int6_527 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_528 = torch.constant.int 131072 | |
%int1_529 = torch.constant.int 1 | |
%1001 = torch.prim.ListConstruct %int131072_528, %int1_529 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1002 = torch.aten.view %1000, %1001 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1003 = torch.aten.mul.Tensor %1002, %999 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1004 = torch.aten.cos %1003 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_530 = torch.constant.int 15 | |
%1005 = torch.prims.convert_element_type %1004, %int15_530 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1006 = torch.aten.sin %1003 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_531 = torch.constant.int 15 | |
%1007 = torch.prims.convert_element_type %1006, %int15_531 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
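// Slice the recomputed cos/sin tables to the K sequence length and apply the same rotation to K in its
// [1, 8, seq, 128] layout, then transpose back to [1, seq, 8, 128].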
%int1_532 = torch.constant.int 1 | |
%1008 = torch.aten.size.int %885, %int1_532 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_533 = torch.constant.int 0 | |
%1009 = torch.aten.add.int %int0_533, %1008 : !torch.int, !torch.int -> !torch.int | |
%int0_534 = torch.constant.int 0 | |
%int0_535 = torch.constant.int 0 | |
%int1_536 = torch.constant.int 1 | |
%1010 = torch.aten.slice.Tensor %1005, %int0_534, %int0_535, %1009, %int1_536 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1010, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_537 = torch.constant.int 1 | |
%int0_538 = torch.constant.int 0 | |
%int9223372036854775807_539 = torch.constant.int 9223372036854775807 | |
%int1_540 = torch.constant.int 1 | |
%1011 = torch.aten.slice.Tensor %1010, %int1_537, %int0_538, %int9223372036854775807_539, %int1_540 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1011, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_541 = torch.constant.int 0 | |
%1012 = torch.aten.add.int %int0_541, %1008 : !torch.int, !torch.int -> !torch.int | |
%int0_542 = torch.constant.int 0 | |
%int0_543 = torch.constant.int 0 | |
%int1_544 = torch.constant.int 1 | |
%1013 = torch.aten.slice.Tensor %1007, %int0_542, %int0_543, %1012, %int1_544 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1013, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_545 = torch.constant.int 1 | |
%int0_546 = torch.constant.int 0 | |
%int9223372036854775807_547 = torch.constant.int 9223372036854775807 | |
%int1_548 = torch.constant.int 1 | |
%1014 = torch.aten.slice.Tensor %1013, %int1_545, %int0_546, %int9223372036854775807_547, %int1_548 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1014, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_549 = torch.constant.int 0 | |
%1015 = torch.aten.unsqueeze %1011, %int0_549 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1015, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_550 = torch.constant.int 1 | |
%int0_551 = torch.constant.int 0 | |
%int9223372036854775807_552 = torch.constant.int 9223372036854775807 | |
%int1_553 = torch.constant.int 1 | |
%1016 = torch.aten.slice.Tensor %1015, %int1_550, %int0_551, %int9223372036854775807_552, %int1_553 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1016, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_554 = torch.constant.int 2 | |
%1017 = torch.aten.unsqueeze %1016, %int2_554 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1017, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_555 = torch.constant.int 3 | |
%int0_556 = torch.constant.int 0 | |
%int9223372036854775807_557 = torch.constant.int 9223372036854775807 | |
%int1_558 = torch.constant.int 1 | |
%1018 = torch.aten.slice.Tensor %1017, %int3_555, %int0_556, %int9223372036854775807_557, %int1_558 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1018, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_559 = torch.constant.int 0 | |
%1019 = torch.aten.unsqueeze %1014, %int0_559 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1019, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_560 = torch.constant.int 1 | |
%int0_561 = torch.constant.int 0 | |
%int9223372036854775807_562 = torch.constant.int 9223372036854775807 | |
%int1_563 = torch.constant.int 1 | |
%1020 = torch.aten.slice.Tensor %1019, %int1_560, %int0_561, %int9223372036854775807_562, %int1_563 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1020, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_564 = torch.constant.int 2 | |
%1021 = torch.aten.unsqueeze %1020, %int2_564 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1021, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_565 = torch.constant.int 3 | |
%int0_566 = torch.constant.int 0 | |
%int9223372036854775807_567 = torch.constant.int 9223372036854775807 | |
%int1_568 = torch.constant.int 1 | |
%1022 = torch.aten.slice.Tensor %1021, %int3_565, %int0_566, %int9223372036854775807_567, %int1_568 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1022, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_569 = torch.constant.int 1 | |
%int2_570 = torch.constant.int 2 | |
%1023 = torch.aten.transpose.int %1018, %int1_569, %int2_570 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1023, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_571 = torch.constant.int 1 | |
%int1_572 = torch.constant.int 1 | |
%int1_573 = torch.constant.int 1 | |
%int1_574 = torch.constant.int 1 | |
%1024 = torch.prim.ListConstruct %int1_571, %int1_572, %int1_573, %int1_574 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1025 = torch.aten.repeat %1023, %1024 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1025, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_575 = torch.constant.int 1 | |
%int2_576 = torch.constant.int 2 | |
%1026 = torch.aten.transpose.int %1022, %int1_575, %int2_576 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1026, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_577 = torch.constant.int 1 | |
%int2_578 = torch.constant.int 2 | |
%1027 = torch.aten.transpose.int %900, %int1_577, %int2_578 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1027, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_579 = torch.constant.int 1 | |
%int1_580 = torch.constant.int 1 | |
%int1_581 = torch.constant.int 1 | |
%int1_582 = torch.constant.int 1 | |
%1028 = torch.prim.ListConstruct %int1_579, %int1_580, %int1_581, %int1_582 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1029 = torch.aten.repeat %1026, %1028 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1029, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1030 = torch.aten.mul.Tensor %1027, %1025 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1030, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_583 = torch.constant.int 3 | |
%int0_584 = torch.constant.int 0 | |
%int64_585 = torch.constant.int 64 | |
%int1_586 = torch.constant.int 1 | |
%1031 = torch.aten.slice.Tensor %1027, %int3_583, %int0_584, %int64_585, %int1_586 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1031, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_587 = torch.constant.int 3 | |
%int64_588 = torch.constant.int 64 | |
%int9223372036854775807_589 = torch.constant.int 9223372036854775807 | |
%int1_590 = torch.constant.int 1 | |
%1032 = torch.aten.slice.Tensor %1027, %int3_587, %int64_588, %int9223372036854775807_589, %int1_590 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1032, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1033 = torch.aten.neg %1032 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1033, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1034 = torch.prim.ListConstruct %1033, %1031 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_591 = torch.constant.int -1 | |
%1035 = torch.aten.cat %1034, %int-1_591 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1035, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%1036 = torch.aten.mul.Tensor %1035, %1029 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1036, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_592 = torch.constant.int 1 | |
%1037 = torch.aten.add.Tensor %1030, %1036, %int1_592 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1037, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
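// Transpose the rotated key back to [1, seq, 8, 128] and quantize it for the KV cache:
// divide by the cache scale %25, clamp to the f8E4M3FNUZ range [-240, 240], and cast (dtype code 26).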
%int1_593 = torch.constant.int 1 | |
%int2_594 = torch.constant.int 2 | |
%1038 = torch.aten.transpose.int %1037, %int1_593, %int2_594 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1038, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%1039 = torch.aten.div.Tensor %1038, %25 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1039, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_595 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_596 = torch.constant.float 2.400000e+02 | |
%1040 = torch.aten.clamp %1039, %float-2.400000e02_595, %float2.400000e02_596 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1040, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_597 = torch.constant.int 26 | |
%1041 = torch.prims.convert_element_type %1040, %int26_597 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1041, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%1042 = torch.aten.div.Tensor %902, %25 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1042, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_598 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_599 = torch.constant.float 2.400000e+02 | |
%1043 = torch.aten.clamp %1042, %float-2.400000e02_598, %float2.400000e02_599 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1043, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_600 = torch.constant.int 26 | |
%1044 = torch.prims.convert_element_type %1043, %int26_600 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1044, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
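// The value heads %902 are quantized the same way. Next, compute write rows into the paged cache:
// page ids (%arg2) * 64 rows per page plus a fixed offset (2 here, 3 below for V), which appears
// to select this layer's K and V slots within each page.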
%int64_601 = torch.constant.int 64 | |
%1045 = torch.aten.mul.Scalar %arg2, %int64_601 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1045, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int2_602 = torch.constant.int 2 | |
%int1_603 = torch.constant.int 1 | |
%1046 = torch.aten.add.Scalar %1045, %int2_602, %int1_603 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1046, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
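// Scatter the quantized K into the cache: reshape to one row per page slot, view the flat cache
// %777 as [pages, 32, 2, 32, 8, 128], collapse the leading dims to rows, and index_put at %1052.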
%int1_604 = torch.constant.int 1 | |
%int32_605 = torch.constant.int 32 | |
%int8_606 = torch.constant.int 8 | |
%int128_607 = torch.constant.int 128 | |
%1047 = torch.prim.ListConstruct %int1_604, %748, %int32_605, %int8_606, %int128_607 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1048 = torch.aten.view %1041, %1047 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1048, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_608 = torch.constant.int 32 | |
%int8_609 = torch.constant.int 8 | |
%int128_610 = torch.constant.int 128 | |
%1049 = torch.prim.ListConstruct %748, %int32_608, %int8_609, %int128_610 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1050 = torch.aten.view %1048, %1049 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1050, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1051 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1052 = torch.aten.view %1046, %1051 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1052, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%int32_611 = torch.constant.int 32 | |
%int2_612 = torch.constant.int 2 | |
%int32_613 = torch.constant.int 32 | |
%int8_614 = torch.constant.int 8 | |
%int128_615 = torch.constant.int 128 | |
%1053 = torch.prim.ListConstruct %739, %int32_611, %int2_612, %int32_613, %int8_614, %int128_615 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1054 = torch.aten.view %777, %1053 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1054, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_616 = torch.constant.int 32 | |
%1055 = torch.aten.mul.int %739, %int32_616 : !torch.int, !torch.int -> !torch.int | |
%int2_617 = torch.constant.int 2 | |
%1056 = torch.aten.mul.int %1055, %int2_617 : !torch.int, !torch.int -> !torch.int | |
%int32_618 = torch.constant.int 32 | |
%int8_619 = torch.constant.int 8 | |
%int128_620 = torch.constant.int 128 | |
%1057 = torch.prim.ListConstruct %1056, %int32_618, %int8_619, %int128_620 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1058 = torch.aten.view %1054, %1057 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1058, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1059 = torch.prim.ListConstruct %1052 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_621 = torch.constant.bool false | |
%1060 = torch.aten.index_put %1058, %1059, %1050, %false_621 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1060, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_622 = torch.constant.int 32 | |
%int2_623 = torch.constant.int 2 | |
%int32_624 = torch.constant.int 32 | |
%int8_625 = torch.constant.int 8 | |
%int128_626 = torch.constant.int 128 | |
%1061 = torch.prim.ListConstruct %739, %int32_622, %int2_623, %int32_624, %int8_625, %int128_626 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1062 = torch.aten.view %1060, %1061 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1062, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_627 = torch.constant.int 2097152 | |
%1063 = torch.prim.ListConstruct %739, %int2097152_627 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1064 = torch.aten.view %1062, %1063 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1064, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
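// The cache is reshaped back to its flat [pages, 2097152] form, then the same view/scatter
// sequence writes the quantized V at row offset +1 (%1073), yielding the updated flat cache %1081.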
%int32_628 = torch.constant.int 32 | |
%int2_629 = torch.constant.int 2 | |
%int32_630 = torch.constant.int 32 | |
%int8_631 = torch.constant.int 8 | |
%int128_632 = torch.constant.int 128 | |
%1065 = torch.prim.ListConstruct %739, %int32_628, %int2_629, %int32_630, %int8_631, %int128_632 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1066 = torch.aten.view %1064, %1065 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1066, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_633 = torch.constant.int 32 | |
%int8_634 = torch.constant.int 8 | |
%int128_635 = torch.constant.int 128 | |
%1067 = torch.prim.ListConstruct %1056, %int32_633, %int8_634, %int128_635 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1068 = torch.aten.view %1066, %1067 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1068, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_636 = torch.constant.int 1 | |
%int32_637 = torch.constant.int 32 | |
%int8_638 = torch.constant.int 8 | |
%int128_639 = torch.constant.int 128 | |
%1069 = torch.prim.ListConstruct %int1_636, %748, %int32_637, %int8_638, %int128_639 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1070 = torch.aten.view %1044, %1069 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1070, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_640 = torch.constant.int 32 | |
%int8_641 = torch.constant.int 8 | |
%int128_642 = torch.constant.int 128 | |
%1071 = torch.prim.ListConstruct %748, %int32_640, %int8_641, %int128_642 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1072 = torch.aten.view %1070, %1071 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1072, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_643 = torch.constant.int 1 | |
%int1_644 = torch.constant.int 1 | |
%1073 = torch.aten.add.Scalar %1046, %int1_643, %int1_644 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1073, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%1074 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1075 = torch.aten.view %1073, %1074 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1075, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%1076 = torch.prim.ListConstruct %1075 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_645 = torch.constant.bool false | |
%1077 = torch.aten.index_put %1068, %1076, %1072, %false_645 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1077, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_646 = torch.constant.int 32 | |
%int2_647 = torch.constant.int 2 | |
%int32_648 = torch.constant.int 32 | |
%int8_649 = torch.constant.int 8 | |
%int128_650 = torch.constant.int 128 | |
%1078 = torch.prim.ListConstruct %739, %int32_646, %int2_647, %int32_648, %int8_649, %int128_650 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1079 = torch.aten.view %1077, %1078 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1079, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_651 = torch.constant.int 2097152 | |
%1080 = torch.prim.ListConstruct %739, %int2097152_651 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1081 = torch.aten.view %1079, %1080 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1081, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
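// Grouped-query attention head expansion: unsqueeze the 8 KV heads, expand by a factor of 4,
// clone, and view to [1, seq, 32, 128] so K and V match the 32 query heads.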
%int-2_652 = torch.constant.int -2 | |
%1082 = torch.aten.unsqueeze %1041, %int-2_652 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1082, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_653 = torch.constant.int 1 | |
%int8_654 = torch.constant.int 8 | |
%int4_655 = torch.constant.int 4 | |
%int128_656 = torch.constant.int 128 | |
%1083 = torch.prim.ListConstruct %int1_653, %1008, %int8_654, %int4_655, %int128_656 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_657 = torch.constant.bool false | |
%1084 = torch.aten.expand %1082, %1083, %false_657 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1084, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_658 = torch.constant.int 0 | |
%1085 = torch.aten.clone %1084, %int0_658 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1085, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_659 = torch.constant.int 1 | |
%int32_660 = torch.constant.int 32 | |
%int128_661 = torch.constant.int 128 | |
%1086 = torch.prim.ListConstruct %int1_659, %1008, %int32_660, %int128_661 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1087 = torch.aten._unsafe_view %1085, %1086 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1087, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_662 = torch.constant.int -2 | |
%1088 = torch.aten.unsqueeze %1044, %int-2_662 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1088, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_663 = torch.constant.int 1 | |
%1089 = torch.aten.size.int %895, %int1_663 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_664 = torch.constant.int 1 | |
%int8_665 = torch.constant.int 8 | |
%int4_666 = torch.constant.int 4 | |
%int128_667 = torch.constant.int 128 | |
%1090 = torch.prim.ListConstruct %int1_664, %1089, %int8_665, %int4_666, %int128_667 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_668 = torch.constant.bool false | |
%1091 = torch.aten.expand %1088, %1090, %false_668 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1091, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_669 = torch.constant.int 0 | |
%1092 = torch.aten.clone %1091, %int0_669 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1092, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_670 = torch.constant.int 1 | |
%int32_671 = torch.constant.int 32 | |
%int128_672 = torch.constant.int 128 | |
%1093 = torch.prim.ListConstruct %int1_670, %1089, %int32_671, %int128_672 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1094 = torch.aten._unsafe_view %1092, %1093 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1094, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
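// Dequantize the expanded K and V for attention: cast f8E4M3FNUZ -> f32, multiply by the cache
// scale %25, and cast down to bf16.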
%int6_673 = torch.constant.int 6 | |
%1095 = torch.prims.convert_element_type %1087, %int6_673 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1095, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%1096 = torch.aten.mul.Tensor %1095, %25 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1096, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_674 = torch.constant.int 15 | |
%1097 = torch.prims.convert_element_type %1096, %int15_674 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1097, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_675 = torch.constant.int 6 | |
%1098 = torch.prims.convert_element_type %1094, %int6_675 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1098, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%1099 = torch.aten.mul.Tensor %1098, %25 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1099, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_676 = torch.constant.int 15 | |
%1100 = torch.prims.convert_element_type %1099, %int15_676 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1100, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
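// Transpose Q (%970), K, and V to [1, 32, seq, 128] and run causal scaled-dot-product attention
// (dropout 0.0, is_causal = true).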
%int1_677 = torch.constant.int 1 | |
%int2_678 = torch.constant.int 2 | |
%1101 = torch.aten.transpose.int %970, %int1_677, %int2_678 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1101, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_679 = torch.constant.int 1 | |
%int2_680 = torch.constant.int 2 | |
%1102 = torch.aten.transpose.int %1097, %int1_679, %int2_680 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1102, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_681 = torch.constant.int 1 | |
%int2_682 = torch.constant.int 2 | |
%1103 = torch.aten.transpose.int %1100, %int1_681, %int2_682 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1103, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00_683 = torch.constant.float 0.000000e+00 | |
%true_684 = torch.constant.bool true | |
%none_685 = torch.constant.none | |
%none_686 = torch.constant.none | |
%1104:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1101, %1102, %1103, %float0.000000e00_683, %true_684, %none_685, %none_686) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %1104#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
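// Attention output path: transpose back to [1, seq, 32, 128], flatten to [1, seq, 4096],
// quantize with the output scale %26 (clamp to +/-240, cast to f8), and matmul with the
// transposed attention-output weight %27 before dequantizing to bf16.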
%int1_687 = torch.constant.int 1 | |
%int2_688 = torch.constant.int 2 | |
%1105 = torch.aten.transpose.int %1104#0, %int1_687, %int2_688 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1105, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_689 = torch.constant.int 1 | |
%int4096_690 = torch.constant.int 4096 | |
%1106 = torch.prim.ListConstruct %int1_689, %940, %int4096_690 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1107 = torch.aten.view %1105, %1106 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1107, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1108 = torch.aten.div.Tensor %1107, %26 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1108, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_691 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_692 = torch.constant.float 2.400000e+02 | |
%1109 = torch.aten.clamp %1108, %float-2.400000e02_691, %float2.400000e02_692 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1109, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_693 = torch.constant.int 26 | |
%1110 = torch.prims.convert_element_type %1109, %int26_693 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1110, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_694 = torch.constant.int -2 | |
%int-1_695 = torch.constant.int -1 | |
%1111 = torch.aten.transpose.int %27, %int-2_694, %int-1_695 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_696 = torch.constant.int 4096 | |
%1112 = torch.prim.ListConstruct %940, %int4096_696 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1113 = torch.aten.view %1110, %1112 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1113, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1114 = torch.aten.mm %1113, %1111 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1114, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_697 = torch.constant.int 1 | |
%int4096_698 = torch.constant.int 4096 | |
%1115 = torch.prim.ListConstruct %int1_697, %940, %int4096_698 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1116 = torch.aten.view %1114, %1115 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1116, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_699 = torch.constant.int 15 | |
%1117 = torch.prims.convert_element_type %1116, %int15_699 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1117, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
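// Residual connection: add the projected attention output to the block input %857.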
%int1_700 = torch.constant.int 1 | |
%1118 = torch.aten.add.Tensor %857, %1117, %int1_700 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1118, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
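// RMSNorm of the residual stream in f32: x * rsqrt(mean(x^2, dim=-1) + 1e-5), scaled by the
// [4096] norm weight %28 (presumably the ffn_norm weight).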
%int6_701 = torch.constant.int 6 | |
%1119 = torch.prims.convert_element_type %1118, %int6_701 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1119, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_702 = torch.constant.int 2 | |
%1120 = torch.aten.pow.Tensor_Scalar %1119, %int2_702 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1120, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_703 = torch.constant.int -1 | |
%1121 = torch.prim.ListConstruct %int-1_703 : (!torch.int) -> !torch.list<int> | |
%true_704 = torch.constant.bool true | |
%none_705 = torch.constant.none | |
%1122 = torch.aten.mean.dim %1120, %1121, %true_704, %none_705 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1122, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_706 = torch.constant.float 1.000000e-05 | |
%int1_707 = torch.constant.int 1 | |
%1123 = torch.aten.add.Scalar %1122, %float1.000000e-05_706, %int1_707 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1123, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1124 = torch.aten.rsqrt %1123 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1124, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1125 = torch.aten.mul.Tensor %1119, %1124 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1125, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_708 = torch.constant.int 15 | |
%1126 = torch.prims.convert_element_type %1125, %int15_708 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1126, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1127 = torch.aten.mul.Tensor %28, %1126 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1127, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
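// SwiGLU feed-forward, gate branch: quantize the normed activations (scale %29), matmul with the
// transposed gate weight %30 ([4096, 14336] after transpose), dequantize to bf16, and apply SiLU.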
%1128 = torch.aten.div.Tensor %1127, %29 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1128, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_709 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_710 = torch.constant.float 2.400000e+02 | |
%1129 = torch.aten.clamp %1128, %float-2.400000e02_709, %float2.400000e02_710 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1129, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_711 = torch.constant.int 26 | |
%1130 = torch.prims.convert_element_type %1129, %int26_711 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1130, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_712 = torch.constant.int -2 | |
%int-1_713 = torch.constant.int -1 | |
%1131 = torch.aten.transpose.int %30, %int-2_712, %int-1_713 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_714 = torch.constant.int 4096 | |
%1132 = torch.prim.ListConstruct %564, %int4096_714 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1133 = torch.aten.view %1130, %1132 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1133, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1134 = torch.aten.mm %1133, %1131 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1134, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_715 = torch.constant.int 1 | |
%int14336_716 = torch.constant.int 14336 | |
%1135 = torch.prim.ListConstruct %int1_715, %564, %int14336_716 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1136 = torch.aten.view %1134, %1135 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1136, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_717 = torch.constant.int 15 | |
%1137 = torch.prims.convert_element_type %1136, %int15_717 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1137, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1138 = torch.aten.silu %1137 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1138, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
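// Up branch: the same quantize (scale %31) / matmul (weight %32) / dequantize path; its bf16
// result is multiplied elementwise with the SiLU-gated branch.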
%1139 = torch.aten.div.Tensor %1127, %31 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1139, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_718 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_719 = torch.constant.float 2.400000e+02 | |
%1140 = torch.aten.clamp %1139, %float-2.400000e02_718, %float2.400000e02_719 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1140, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_720 = torch.constant.int 26 | |
%1141 = torch.prims.convert_element_type %1140, %int26_720 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1141, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_721 = torch.constant.int -2 | |
%int-1_722 = torch.constant.int -1 | |
%1142 = torch.aten.transpose.int %32, %int-2_721, %int-1_722 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_723 = torch.constant.int 4096 | |
%1143 = torch.prim.ListConstruct %564, %int4096_723 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1144 = torch.aten.view %1141, %1143 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1144, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1145 = torch.aten.mm %1144, %1142 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1145, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_724 = torch.constant.int 1 | |
%int14336_725 = torch.constant.int 14336 | |
%1146 = torch.prim.ListConstruct %int1_724, %564, %int14336_725 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1147 = torch.aten.view %1145, %1146 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1147, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_726 = torch.constant.int 15 | |
%1148 = torch.prims.convert_element_type %1147, %int15_726 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1148, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1149 = torch.aten.mul.Tensor %1138, %1148 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1149, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
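// Down projection: quantize the gated product (scale %33), matmul with the transposed down weight
// %34, dequantize to bf16, and add the residual %1118 to close out this block.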
%1150 = torch.aten.div.Tensor %1149, %33 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1150, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_727 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_728 = torch.constant.float 2.400000e+02 | |
%1151 = torch.aten.clamp %1150, %float-2.400000e02_727, %float2.400000e02_728 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1151, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_729 = torch.constant.int 26 | |
%1152 = torch.prims.convert_element_type %1151, %int26_729 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1152, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_730 = torch.constant.int -2 | |
%int-1_731 = torch.constant.int -1 | |
%1153 = torch.aten.transpose.int %34, %int-2_730, %int-1_731 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_732 = torch.constant.int 1 | |
%1154 = torch.aten.size.int %1136, %int1_732 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_733 = torch.constant.int 14336 | |
%1155 = torch.prim.ListConstruct %1154, %int14336_733 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1156 = torch.aten.view %1152, %1155 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1156, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%1157 = torch.aten.mm %1156, %1153 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1157, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_734 = torch.constant.int 1 | |
%int4096_735 = torch.constant.int 4096 | |
%1158 = torch.prim.ListConstruct %int1_734, %1154, %int4096_735 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1159 = torch.aten.view %1157, %1158 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1159, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_736 = torch.constant.int 15 | |
%1160 = torch.prims.convert_element_type %1159, %int15_736 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1160, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_737 = torch.constant.int 1 | |
%1161 = torch.aten.add.Tensor %1118, %1160, %int1_737 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1161, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
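// What appears to be the start of the next transformer block: RMSNorm of the residual stream
// with the attention-norm weight %35.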
%int6_738 = torch.constant.int 6 | |
%1162 = torch.prims.convert_element_type %1161, %int6_738 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1162, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_739 = torch.constant.int 2 | |
%1163 = torch.aten.pow.Tensor_Scalar %1162, %int2_739 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1163, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_740 = torch.constant.int -1 | |
%1164 = torch.prim.ListConstruct %int-1_740 : (!torch.int) -> !torch.list<int> | |
%true_741 = torch.constant.bool true | |
%none_742 = torch.constant.none | |
%1165 = torch.aten.mean.dim %1163, %1164, %true_741, %none_742 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1165, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_743 = torch.constant.float 1.000000e-05 | |
%int1_744 = torch.constant.int 1 | |
%1166 = torch.aten.add.Scalar %1165, %float1.000000e-05_743, %int1_744 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1166, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1167 = torch.aten.rsqrt %1166 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1167, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1168 = torch.aten.mul.Tensor %1162, %1167 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1168, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_745 = torch.constant.int 15 | |
%1169 = torch.prims.convert_element_type %1168, %int15_745 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1169, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1170 = torch.aten.mul.Tensor %35, %1169 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1170, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
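// Q/K/V projections: per-input quantization (scales %36, %38, %40), matmuls with the transposed
// weights %37 (4096x4096), %39 and %41 (1024x4096 each), and dequantization back to bf16.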
%1171 = torch.aten.div.Tensor %1170, %36 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1171, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_746 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_747 = torch.constant.float 2.400000e+02 | |
%1172 = torch.aten.clamp %1171, %float-2.400000e02_746, %float2.400000e02_747 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1172, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_748 = torch.constant.int 26 | |
%1173 = torch.prims.convert_element_type %1172, %int26_748 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1173, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_749 = torch.constant.int -2 | |
%int-1_750 = torch.constant.int -1 | |
%1174 = torch.aten.transpose.int %37, %int-2_749, %int-1_750 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_751 = torch.constant.int 4096 | |
%1175 = torch.prim.ListConstruct %564, %int4096_751 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1176 = torch.aten.view %1173, %1175 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1176, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1177 = torch.aten.mm %1176, %1174 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1177, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_752 = torch.constant.int 1 | |
%int4096_753 = torch.constant.int 4096 | |
%1178 = torch.prim.ListConstruct %int1_752, %564, %int4096_753 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1179 = torch.aten.view %1177, %1178 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1179, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_754 = torch.constant.int 15 | |
%1180 = torch.prims.convert_element_type %1179, %int15_754 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1180, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1181 = torch.aten.div.Tensor %1170, %38 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1181, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_755 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_756 = torch.constant.float 2.400000e+02 | |
%1182 = torch.aten.clamp %1181, %float-2.400000e02_755, %float2.400000e02_756 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1182, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_757 = torch.constant.int 26 | |
%1183 = torch.prims.convert_element_type %1182, %int26_757 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1183, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_758 = torch.constant.int -2 | |
%int-1_759 = torch.constant.int -1 | |
%1184 = torch.aten.transpose.int %39, %int-2_758, %int-1_759 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_760 = torch.constant.int 4096 | |
%1185 = torch.prim.ListConstruct %564, %int4096_760 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1186 = torch.aten.view %1183, %1185 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1186, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1187 = torch.aten.mm %1186, %1184 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1187, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_761 = torch.constant.int 1 | |
%int1024_762 = torch.constant.int 1024 | |
%1188 = torch.prim.ListConstruct %int1_761, %564, %int1024_762 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1189 = torch.aten.view %1187, %1188 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1189, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_763 = torch.constant.int 15 | |
%1190 = torch.prims.convert_element_type %1189, %int15_763 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %1190, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%1191 = torch.aten.div.Tensor %1170, %40 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1191, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_764 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_765 = torch.constant.float 2.400000e+02 | |
%1192 = torch.aten.clamp %1191, %float-2.400000e02_764, %float2.400000e02_765 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1192, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_766 = torch.constant.int 26 | |
%1193 = torch.prims.convert_element_type %1192, %int26_766 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1193, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_767 = torch.constant.int -2 | |
%int-1_768 = torch.constant.int -1 | |
%1194 = torch.aten.transpose.int %41, %int-2_767, %int-1_768 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_769 = torch.constant.int 4096 | |
%1195 = torch.prim.ListConstruct %564, %int4096_769 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1196 = torch.aten.view %1193, %1195 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1196, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1197 = torch.aten.mm %1196, %1194 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1197, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_770 = torch.constant.int 1 | |
%int1024_771 = torch.constant.int 1024 | |
%1198 = torch.prim.ListConstruct %int1_770, %564, %int1024_771 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1199 = torch.aten.view %1197, %1198 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1199, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_772 = torch.constant.int 15 | |
%1200 = torch.prims.convert_element_type %1199, %int15_772 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %1200, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
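// Split the projections into heads: Q -> [1, seq, 32, 128], K and V -> [1, seq, 8, 128].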
%int1_773 = torch.constant.int 1 | |
%int32_774 = torch.constant.int 32 | |
%int128_775 = torch.constant.int 128 | |
%1201 = torch.prim.ListConstruct %int1_773, %564, %int32_774, %int128_775 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1202 = torch.aten.view %1180, %1201 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1202, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_776 = torch.constant.int 1 | |
%int8_777 = torch.constant.int 8 | |
%int128_778 = torch.constant.int 128 | |
%1203 = torch.prim.ListConstruct %int1_776, %564, %int8_777, %int128_778 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1204 = torch.aten.view %1190, %1203 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1204, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_779 = torch.constant.int 1 | |
%int8_780 = torch.constant.int 8 | |
%int128_781 = torch.constant.int 128 | |
%1205 = torch.prim.ListConstruct %int1_779, %564, %int8_780, %int128_781 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1206 = torch.aten.view %1200, %1205 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1206, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
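// Rebuild the rotary-embedding tables: positions 0..131071 and inverse frequencies
// 1 / 500000^(2i/128), followed by what looks like the Llama-3 long-context frequency rescaling
// (wavelength cutoffs at 8192 and 2048, scaling factor 8), then cos/sin tables cast to bf16.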
%int131072_782 = torch.constant.int 131072 | |
%none_783 = torch.constant.none | |
%none_784 = torch.constant.none | |
%cpu_785 = torch.constant.device "cpu" | |
%false_786 = torch.constant.bool false | |
%1207 = torch.aten.arange %int131072_782, %none_783, %none_784, %cpu_785, %false_786 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_787 = torch.constant.int 0 | |
%int128_788 = torch.constant.int 128 | |
%int2_789 = torch.constant.int 2 | |
%int4_790 = torch.constant.int 4 | |
%none_791 = torch.constant.none | |
%cpu_792 = torch.constant.device "cpu" | |
%false_793 = torch.constant.bool false | |
%1208 = torch.aten.arange.start_step %int0_787, %int128_788, %int2_789, %int4_790, %none_791, %cpu_792, %false_793 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_794 = torch.constant.int 6 | |
%1209 = torch.prims.convert_element_type %1208, %int6_794 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_795 = torch.constant.int 128 | |
%1210 = torch.aten.div.Scalar %1209, %int128_795 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_796 = torch.constant.float 5.000000e+05 | |
%1211 = torch.aten.pow.Scalar %float5.000000e05_796, %1210 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1212 = torch.aten.reciprocal %1211 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_797 = torch.constant.float 1.000000e+00 | |
%1213 = torch.aten.mul.Scalar %1212, %float1.000000e00_797 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1214 = torch.aten.reciprocal %1213 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_798 = torch.constant.float 6.2831853071795862 | |
%1215 = torch.aten.mul.Scalar %1214, %float6.283190e00_798 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_799 = torch.constant.float 8.192000e+03 | |
%1216 = torch.aten.gt.Scalar %1215, %float8.192000e03_799 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_800 = torch.constant.int 8 | |
%1217 = torch.aten.div.Scalar %1213, %int8_800 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1218 = torch.aten.where.self %1216, %1217, %1213 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1219 = torch.aten.reciprocal %1215 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_801 = torch.constant.int 8192 | |
%1220 = torch.aten.mul.Scalar %1219, %int8192_801 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_802 = torch.constant.int 1 | |
%int1_803 = torch.constant.int 1 | |
%1221 = torch.aten.sub.Scalar %1220, %int1_802, %int1_803 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_804 = torch.constant.int 3 | |
%1222 = torch.aten.div.Scalar %1221, %int3_804 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_805 = torch.constant.int 1 | |
%int1_806 = torch.constant.int 1 | |
%1223 = torch.aten.rsub.Scalar %1222, %int1_805, %int1_806 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1224 = torch.aten.mul.Tensor %1223, %1218 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_807 = torch.constant.int 8 | |
%1225 = torch.aten.div.Scalar %1224, %int8_807 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1226 = torch.aten.mul.Tensor %1222, %1218 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_808 = torch.constant.int 1 | |
%1227 = torch.aten.add.Tensor %1225, %1226, %int1_808 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_809 = torch.constant.float 2.048000e+03 | |
%1228 = torch.aten.lt.Scalar %1215, %float2.048000e03_809 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1229 = torch.aten.bitwise_not %1228 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_810 = torch.constant.float 8.192000e+03 | |
%1230 = torch.aten.gt.Scalar %1215, %float8.192000e03_810 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1231 = torch.aten.bitwise_not %1230 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1232 = torch.aten.mul.Tensor %1229, %1231 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1233 = torch.aten.where.self %1232, %1227, %1218 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1234 = torch.prim.ListConstruct %1233, %1233 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_811 = torch.constant.int -1 | |
%1235 = torch.aten.cat %1234, %int-1_811 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_812 = torch.constant.int 6 | |
%1236 = torch.prims.convert_element_type %1207, %int6_812 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_813 = torch.constant.int 131072 | |
%int1_814 = torch.constant.int 1 | |
%1237 = torch.prim.ListConstruct %int131072_813, %int1_814 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1238 = torch.aten.view %1236, %1237 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1239 = torch.aten.mul.Tensor %1238, %1235 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1240 = torch.aten.cos %1239 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_815 = torch.constant.int 15 | |
%1241 = torch.prims.convert_element_type %1240, %int15_815 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1242 = torch.aten.sin %1239 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_816 = torch.constant.int 15 | |
%1243 = torch.prims.convert_element_type %1242, %int15_816 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
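// Slice the cos/sin tables to the current sequence length (%1244) and unsqueeze them to
// [1, seq, 1, 128] for broadcasting over the heads.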
%int1_817 = torch.constant.int 1 | |
%1244 = torch.aten.size.int %1179, %int1_817 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_818 = torch.constant.int 0 | |
%1245 = torch.aten.add.int %int0_818, %1244 : !torch.int, !torch.int -> !torch.int | |
%int0_819 = torch.constant.int 0 | |
%int0_820 = torch.constant.int 0 | |
%int1_821 = torch.constant.int 1 | |
%1246 = torch.aten.slice.Tensor %1241, %int0_819, %int0_820, %1245, %int1_821 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1246, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_822 = torch.constant.int 1 | |
%int0_823 = torch.constant.int 0 | |
%int9223372036854775807_824 = torch.constant.int 9223372036854775807 | |
%int1_825 = torch.constant.int 1 | |
%1247 = torch.aten.slice.Tensor %1246, %int1_822, %int0_823, %int9223372036854775807_824, %int1_825 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1247, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_826 = torch.constant.int 0 | |
%1248 = torch.aten.add.int %int0_826, %1244 : !torch.int, !torch.int -> !torch.int | |
%int0_827 = torch.constant.int 0 | |
%int0_828 = torch.constant.int 0 | |
%int1_829 = torch.constant.int 1 | |
%1249 = torch.aten.slice.Tensor %1243, %int0_827, %int0_828, %1248, %int1_829 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1249, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_830 = torch.constant.int 1 | |
%int0_831 = torch.constant.int 0 | |
%int9223372036854775807_832 = torch.constant.int 9223372036854775807 | |
%int1_833 = torch.constant.int 1 | |
%1250 = torch.aten.slice.Tensor %1249, %int1_830, %int0_831, %int9223372036854775807_832, %int1_833 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1250, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_834 = torch.constant.int 0 | |
%1251 = torch.aten.unsqueeze %1247, %int0_834 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1251, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_835 = torch.constant.int 1 | |
%int0_836 = torch.constant.int 0 | |
%int9223372036854775807_837 = torch.constant.int 9223372036854775807 | |
%int1_838 = torch.constant.int 1 | |
%1252 = torch.aten.slice.Tensor %1251, %int1_835, %int0_836, %int9223372036854775807_837, %int1_838 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1252, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_839 = torch.constant.int 2 | |
%1253 = torch.aten.unsqueeze %1252, %int2_839 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1253, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_840 = torch.constant.int 3 | |
%int0_841 = torch.constant.int 0 | |
%int9223372036854775807_842 = torch.constant.int 9223372036854775807 | |
%int1_843 = torch.constant.int 1 | |
%1254 = torch.aten.slice.Tensor %1253, %int3_840, %int0_841, %int9223372036854775807_842, %int1_843 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1254, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_844 = torch.constant.int 0 | |
%1255 = torch.aten.unsqueeze %1250, %int0_844 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1255, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_845 = torch.constant.int 1 | |
%int0_846 = torch.constant.int 0 | |
%int9223372036854775807_847 = torch.constant.int 9223372036854775807 | |
%int1_848 = torch.constant.int 1 | |
%1256 = torch.aten.slice.Tensor %1255, %int1_845, %int0_846, %int9223372036854775807_847, %int1_848 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1256, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_849 = torch.constant.int 2 | |
%1257 = torch.aten.unsqueeze %1256, %int2_849 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1257, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_850 = torch.constant.int 3 | |
%int0_851 = torch.constant.int 0 | |
%int9223372036854775807_852 = torch.constant.int 9223372036854775807 | |
%int1_853 = torch.constant.int 1 | |
%1258 = torch.aten.slice.Tensor %1257, %int3_850, %int0_851, %int9223372036854775807_852, %int1_853 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1258, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_854 = torch.constant.int 1 | |
%int2_855 = torch.constant.int 2 | |
%1259 = torch.aten.transpose.int %1254, %int1_854, %int2_855 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1259, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_856 = torch.constant.int 1 | |
%int1_857 = torch.constant.int 1 | |
%int1_858 = torch.constant.int 1 | |
%int1_859 = torch.constant.int 1 | |
%1260 = torch.prim.ListConstruct %int1_856, %int1_857, %int1_858, %int1_859 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1261 = torch.aten.repeat %1259, %1260 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1261, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_860 = torch.constant.int 1 | |
%int2_861 = torch.constant.int 2 | |
%1262 = torch.aten.transpose.int %1258, %int1_860, %int2_861 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1262, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
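// Note (added annotation): rotary embedding is applied to the query states %1263 (32 heads):
// q * cos + rotate_half(q) * sin, where rotate_half negates the upper 64 dims and swaps the halves;
// the result %1274 is transposed back to [1, seq, 32, 128].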
%int1_862 = torch.constant.int 1 | |
%int2_863 = torch.constant.int 2 | |
%1263 = torch.aten.transpose.int %1202, %int1_862, %int2_863 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1263, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_864 = torch.constant.int 1 | |
%int1_865 = torch.constant.int 1 | |
%int1_866 = torch.constant.int 1 | |
%int1_867 = torch.constant.int 1 | |
%1264 = torch.prim.ListConstruct %int1_864, %int1_865, %int1_866, %int1_867 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1265 = torch.aten.repeat %1262, %1264 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1265, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1266 = torch.aten.mul.Tensor %1263, %1261 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1266, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_868 = torch.constant.int 3 | |
%int0_869 = torch.constant.int 0 | |
%int64_870 = torch.constant.int 64 | |
%int1_871 = torch.constant.int 1 | |
%1267 = torch.aten.slice.Tensor %1263, %int3_868, %int0_869, %int64_870, %int1_871 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1267, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_872 = torch.constant.int 3 | |
%int64_873 = torch.constant.int 64 | |
%int9223372036854775807_874 = torch.constant.int 9223372036854775807 | |
%int1_875 = torch.constant.int 1 | |
%1268 = torch.aten.slice.Tensor %1263, %int3_872, %int64_873, %int9223372036854775807_874, %int1_875 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1268, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%1269 = torch.aten.neg %1268 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1269, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%1270 = torch.prim.ListConstruct %1269, %1267 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_876 = torch.constant.int -1 | |
%1271 = torch.aten.cat %1270, %int-1_876 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1271, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%1272 = torch.aten.mul.Tensor %1271, %1265 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1272, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_877 = torch.constant.int 1 | |
%1273 = torch.aten.add.Tensor %1266, %1272, %int1_877 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1273, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_878 = torch.constant.int 1 | |
%int2_879 = torch.constant.int 2 | |
%1274 = torch.aten.transpose.int %1273, %int1_878, %int2_879 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1274, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
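// Note (added annotation): the same rotary table construction (131072 positions, identical frequency scaling)
// is repeated from scratch below for the key path.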
%int131072_880 = torch.constant.int 131072 | |
%none_881 = torch.constant.none | |
%none_882 = torch.constant.none | |
%cpu_883 = torch.constant.device "cpu" | |
%false_884 = torch.constant.bool false | |
%1275 = torch.aten.arange %int131072_880, %none_881, %none_882, %cpu_883, %false_884 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_885 = torch.constant.int 0 | |
%int128_886 = torch.constant.int 128 | |
%int2_887 = torch.constant.int 2 | |
%int4_888 = torch.constant.int 4 | |
%none_889 = torch.constant.none | |
%cpu_890 = torch.constant.device "cpu" | |
%false_891 = torch.constant.bool false | |
%1276 = torch.aten.arange.start_step %int0_885, %int128_886, %int2_887, %int4_888, %none_889, %cpu_890, %false_891 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_892 = torch.constant.int 6 | |
%1277 = torch.prims.convert_element_type %1276, %int6_892 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_893 = torch.constant.int 128 | |
%1278 = torch.aten.div.Scalar %1277, %int128_893 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_894 = torch.constant.float 5.000000e+05 | |
%1279 = torch.aten.pow.Scalar %float5.000000e05_894, %1278 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1280 = torch.aten.reciprocal %1279 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_895 = torch.constant.float 1.000000e+00 | |
%1281 = torch.aten.mul.Scalar %1280, %float1.000000e00_895 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1282 = torch.aten.reciprocal %1281 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_896 = torch.constant.float 6.2831853071795862 | |
%1283 = torch.aten.mul.Scalar %1282, %float6.283190e00_896 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_897 = torch.constant.float 8.192000e+03 | |
%1284 = torch.aten.gt.Scalar %1283, %float8.192000e03_897 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_898 = torch.constant.int 8 | |
%1285 = torch.aten.div.Scalar %1281, %int8_898 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1286 = torch.aten.where.self %1284, %1285, %1281 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1287 = torch.aten.reciprocal %1283 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_899 = torch.constant.int 8192 | |
%1288 = torch.aten.mul.Scalar %1287, %int8192_899 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_900 = torch.constant.int 1 | |
%int1_901 = torch.constant.int 1 | |
%1289 = torch.aten.sub.Scalar %1288, %int1_900, %int1_901 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_902 = torch.constant.int 3 | |
%1290 = torch.aten.div.Scalar %1289, %int3_902 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_903 = torch.constant.int 1 | |
%int1_904 = torch.constant.int 1 | |
%1291 = torch.aten.rsub.Scalar %1290, %int1_903, %int1_904 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1292 = torch.aten.mul.Tensor %1291, %1286 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_905 = torch.constant.int 8 | |
%1293 = torch.aten.div.Scalar %1292, %int8_905 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1294 = torch.aten.mul.Tensor %1290, %1286 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_906 = torch.constant.int 1 | |
%1295 = torch.aten.add.Tensor %1293, %1294, %int1_906 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_907 = torch.constant.float 2.048000e+03 | |
%1296 = torch.aten.lt.Scalar %1283, %float2.048000e03_907 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1297 = torch.aten.bitwise_not %1296 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_908 = torch.constant.float 8.192000e+03 | |
%1298 = torch.aten.gt.Scalar %1283, %float8.192000e03_908 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1299 = torch.aten.bitwise_not %1298 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1300 = torch.aten.mul.Tensor %1297, %1299 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1301 = torch.aten.where.self %1300, %1295, %1286 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1302 = torch.prim.ListConstruct %1301, %1301 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_909 = torch.constant.int -1 | |
%1303 = torch.aten.cat %1302, %int-1_909 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_910 = torch.constant.int 6 | |
%1304 = torch.prims.convert_element_type %1275, %int6_910 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_911 = torch.constant.int 131072 | |
%int1_912 = torch.constant.int 1 | |
%1305 = torch.prim.ListConstruct %int131072_911, %int1_912 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1306 = torch.aten.view %1304, %1305 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1307 = torch.aten.mul.Tensor %1306, %1303 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1308 = torch.aten.cos %1307 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_913 = torch.constant.int 15 | |
%1309 = torch.prims.convert_element_type %1308, %int15_913 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1310 = torch.aten.sin %1307 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_914 = torch.constant.int 15 | |
%1311 = torch.prims.convert_element_type %1310, %int15_914 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
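// Note (added annotation): slice these cos/sin tables to the key sequence length (taken from %1189)
// and broadcast to [1, 1, seq, 128], mirroring the query path above.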
%int1_915 = torch.constant.int 1 | |
%1312 = torch.aten.size.int %1189, %int1_915 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_916 = torch.constant.int 0 | |
%1313 = torch.aten.add.int %int0_916, %1312 : !torch.int, !torch.int -> !torch.int | |
%int0_917 = torch.constant.int 0 | |
%int0_918 = torch.constant.int 0 | |
%int1_919 = torch.constant.int 1 | |
%1314 = torch.aten.slice.Tensor %1309, %int0_917, %int0_918, %1313, %int1_919 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1314, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_920 = torch.constant.int 1 | |
%int0_921 = torch.constant.int 0 | |
%int9223372036854775807_922 = torch.constant.int 9223372036854775807 | |
%int1_923 = torch.constant.int 1 | |
%1315 = torch.aten.slice.Tensor %1314, %int1_920, %int0_921, %int9223372036854775807_922, %int1_923 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1315, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_924 = torch.constant.int 0 | |
%1316 = torch.aten.add.int %int0_924, %1312 : !torch.int, !torch.int -> !torch.int | |
%int0_925 = torch.constant.int 0 | |
%int0_926 = torch.constant.int 0 | |
%int1_927 = torch.constant.int 1 | |
%1317 = torch.aten.slice.Tensor %1311, %int0_925, %int0_926, %1316, %int1_927 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1317, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_928 = torch.constant.int 1 | |
%int0_929 = torch.constant.int 0 | |
%int9223372036854775807_930 = torch.constant.int 9223372036854775807 | |
%int1_931 = torch.constant.int 1 | |
%1318 = torch.aten.slice.Tensor %1317, %int1_928, %int0_929, %int9223372036854775807_930, %int1_931 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1318, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_932 = torch.constant.int 0 | |
%1319 = torch.aten.unsqueeze %1315, %int0_932 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1319, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_933 = torch.constant.int 1 | |
%int0_934 = torch.constant.int 0 | |
%int9223372036854775807_935 = torch.constant.int 9223372036854775807 | |
%int1_936 = torch.constant.int 1 | |
%1320 = torch.aten.slice.Tensor %1319, %int1_933, %int0_934, %int9223372036854775807_935, %int1_936 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1320, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_937 = torch.constant.int 2 | |
%1321 = torch.aten.unsqueeze %1320, %int2_937 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1321, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_938 = torch.constant.int 3 | |
%int0_939 = torch.constant.int 0 | |
%int9223372036854775807_940 = torch.constant.int 9223372036854775807 | |
%int1_941 = torch.constant.int 1 | |
%1322 = torch.aten.slice.Tensor %1321, %int3_938, %int0_939, %int9223372036854775807_940, %int1_941 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1322, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_942 = torch.constant.int 0 | |
%1323 = torch.aten.unsqueeze %1318, %int0_942 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1323, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_943 = torch.constant.int 1 | |
%int0_944 = torch.constant.int 0 | |
%int9223372036854775807_945 = torch.constant.int 9223372036854775807 | |
%int1_946 = torch.constant.int 1 | |
%1324 = torch.aten.slice.Tensor %1323, %int1_943, %int0_944, %int9223372036854775807_945, %int1_946 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1324, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_947 = torch.constant.int 2 | |
%1325 = torch.aten.unsqueeze %1324, %int2_947 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1325, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_948 = torch.constant.int 3 | |
%int0_949 = torch.constant.int 0 | |
%int9223372036854775807_950 = torch.constant.int 9223372036854775807 | |
%int1_951 = torch.constant.int 1 | |
%1326 = torch.aten.slice.Tensor %1325, %int3_948, %int0_949, %int9223372036854775807_950, %int1_951 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1326, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_952 = torch.constant.int 1 | |
%int2_953 = torch.constant.int 2 | |
%1327 = torch.aten.transpose.int %1322, %int1_952, %int2_953 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1327, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_954 = torch.constant.int 1 | |
%int1_955 = torch.constant.int 1 | |
%int1_956 = torch.constant.int 1 | |
%int1_957 = torch.constant.int 1 | |
%1328 = torch.prim.ListConstruct %int1_954, %int1_955, %int1_956, %int1_957 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1329 = torch.aten.repeat %1327, %1328 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1329, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_958 = torch.constant.int 1 | |
%int2_959 = torch.constant.int 2 | |
%1330 = torch.aten.transpose.int %1326, %int1_958, %int2_959 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1330, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
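// Note (added annotation): rotary embedding is applied to the key states %1331 (8 KV heads, grouped-query
// attention) with the same k * cos + rotate_half(k) * sin pattern, then transposed back to [1, seq, 8, 128].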
%int1_960 = torch.constant.int 1 | |
%int2_961 = torch.constant.int 2 | |
%1331 = torch.aten.transpose.int %1204, %int1_960, %int2_961 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1331, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_962 = torch.constant.int 1 | |
%int1_963 = torch.constant.int 1 | |
%int1_964 = torch.constant.int 1 | |
%int1_965 = torch.constant.int 1 | |
%1332 = torch.prim.ListConstruct %int1_962, %int1_963, %int1_964, %int1_965 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1333 = torch.aten.repeat %1330, %1332 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1333, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1334 = torch.aten.mul.Tensor %1331, %1329 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1334, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_966 = torch.constant.int 3 | |
%int0_967 = torch.constant.int 0 | |
%int64_968 = torch.constant.int 64 | |
%int1_969 = torch.constant.int 1 | |
%1335 = torch.aten.slice.Tensor %1331, %int3_966, %int0_967, %int64_968, %int1_969 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1335, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_970 = torch.constant.int 3 | |
%int64_971 = torch.constant.int 64 | |
%int9223372036854775807_972 = torch.constant.int 9223372036854775807 | |
%int1_973 = torch.constant.int 1 | |
%1336 = torch.aten.slice.Tensor %1331, %int3_970, %int64_971, %int9223372036854775807_972, %int1_973 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1336, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1337 = torch.aten.neg %1336 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1337, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1338 = torch.prim.ListConstruct %1337, %1335 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_974 = torch.constant.int -1 | |
%1339 = torch.aten.cat %1338, %int-1_974 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1339, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%1340 = torch.aten.mul.Tensor %1339, %1333 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1340, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_975 = torch.constant.int 1 | |
%1341 = torch.aten.add.Tensor %1334, %1340, %int1_975 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1341, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_976 = torch.constant.int 1 | |
%int2_977 = torch.constant.int 2 | |
%1342 = torch.aten.transpose.int %1341, %int1_976, %int2_977 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1342, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
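// Note (added annotation): the rotated keys (%1343) and the value states (%1206) are quantized for the KV
// cache: divide by the shared kv_cache quantizer scale %42, clamp to +/-240 (the finite range of f8E4M3FNUZ),
// and cast to f8E4M3FNUZ.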
%1343 = torch.aten.div.Tensor %1342, %42 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1343, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_978 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_979 = torch.constant.float 2.400000e+02 | |
%1344 = torch.aten.clamp %1343, %float-2.400000e02_978, %float2.400000e02_979 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1344, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_980 = torch.constant.int 26 | |
%1345 = torch.prims.convert_element_type %1344, %int26_980 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1345, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%1346 = torch.aten.div.Tensor %1206, %42 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1346, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_981 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_982 = torch.constant.float 2.400000e+02 | |
%1347 = torch.aten.clamp %1346, %float-2.400000e02_981, %float2.400000e02_982 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1347, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_983 = torch.constant.int 26 | |
%1348 = torch.prims.convert_element_type %1347, %int26_983 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1348, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
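// Note (added annotation): flat slot indices into the paged KV cache are built as %arg2 * 64 + 4
// (64 appears to be 32 transformer blocks x 2 K/V planes per page; the +4 offset appears to select this
// block's K plane), and the quantized keys are reshaped into per-page [32, 8, 128] tiles.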
%int64_984 = torch.constant.int 64 | |
%1349 = torch.aten.mul.Scalar %arg2, %int64_984 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1349, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int4_985 = torch.constant.int 4 | |
%int1_986 = torch.constant.int 1 | |
%1350 = torch.aten.add.Scalar %1349, %int4_985, %int1_986 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1350, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int1_987 = torch.constant.int 1 | |
%int32_988 = torch.constant.int 32 | |
%int8_989 = torch.constant.int 8 | |
%int128_990 = torch.constant.int 128 | |
%1351 = torch.prim.ListConstruct %int1_987, %748, %int32_988, %int8_989, %int128_990 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1352 = torch.aten.view %1345, %1351 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1352, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_991 = torch.constant.int 32 | |
%int8_992 = torch.constant.int 8 | |
%int128_993 = torch.constant.int 128 | |
%1353 = torch.prim.ListConstruct %748, %int32_991, %int8_992, %int128_993 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1354 = torch.aten.view %1352, %1353 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1354, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1355 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1356 = torch.aten.view %1350, %1355 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1356, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%int32_994 = torch.constant.int 32 | |
%int2_995 = torch.constant.int 2 | |
%int32_996 = torch.constant.int 32 | |
%int8_997 = torch.constant.int 8 | |
%int128_998 = torch.constant.int 128 | |
%1357 = torch.prim.ListConstruct %739, %int32_994, %int2_995, %int32_996, %int8_997, %int128_998 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1358 = torch.aten.view %1081, %1357 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1358, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_999 = torch.constant.int 32 | |
%1359 = torch.aten.mul.int %739, %int32_999 : !torch.int, !torch.int -> !torch.int | |
%int2_1000 = torch.constant.int 2 | |
%1360 = torch.aten.mul.int %1359, %int2_1000 : !torch.int, !torch.int -> !torch.int | |
%int32_1001 = torch.constant.int 32 | |
%int8_1002 = torch.constant.int 8 | |
%int128_1003 = torch.constant.int 128 | |
%1361 = torch.prim.ListConstruct %1360, %int32_1001, %int8_1002, %int128_1003 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1362 = torch.aten.view %1358, %1361 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1362, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1363 = torch.prim.ListConstruct %1356 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1004 = torch.constant.bool false | |
%1364 = torch.aten.index_put %1362, %1363, %1354, %false_1004 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1364, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1005 = torch.constant.int 32 | |
%int2_1006 = torch.constant.int 2 | |
%int32_1007 = torch.constant.int 32 | |
%int8_1008 = torch.constant.int 8 | |
%int128_1009 = torch.constant.int 128 | |
%1365 = torch.prim.ListConstruct %739, %int32_1005, %int2_1006, %int32_1007, %int8_1008, %int128_1009 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1366 = torch.aten.view %1364, %1365 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1366, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1010 = torch.constant.int 2097152 | |
%1367 = torch.prim.ListConstruct %739, %int2097152_1010 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1368 = torch.aten.view %1366, %1367 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1368, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
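// Note (added annotation): the cache buffer %1081 was viewed as [pages, 32, 2, 32, 8, 128], flattened, and the
// K tiles scattered in via index_put; the same scatter is now repeated for the V tiles at slot index + 1.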
%int32_1011 = torch.constant.int 32 | |
%int2_1012 = torch.constant.int 2 | |
%int32_1013 = torch.constant.int 32 | |
%int8_1014 = torch.constant.int 8 | |
%int128_1015 = torch.constant.int 128 | |
%1369 = torch.prim.ListConstruct %739, %int32_1011, %int2_1012, %int32_1013, %int8_1014, %int128_1015 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1370 = torch.aten.view %1368, %1369 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1370, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1016 = torch.constant.int 32 | |
%int8_1017 = torch.constant.int 8 | |
%int128_1018 = torch.constant.int 128 | |
%1371 = torch.prim.ListConstruct %1360, %int32_1016, %int8_1017, %int128_1018 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1372 = torch.aten.view %1370, %1371 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1372, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1019 = torch.constant.int 1 | |
%int32_1020 = torch.constant.int 32 | |
%int8_1021 = torch.constant.int 8 | |
%int128_1022 = torch.constant.int 128 | |
%1373 = torch.prim.ListConstruct %int1_1019, %748, %int32_1020, %int8_1021, %int128_1022 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1374 = torch.aten.view %1348, %1373 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1374, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_1023 = torch.constant.int 32 | |
%int8_1024 = torch.constant.int 8 | |
%int128_1025 = torch.constant.int 128 | |
%1375 = torch.prim.ListConstruct %748, %int32_1023, %int8_1024, %int128_1025 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1376 = torch.aten.view %1374, %1375 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1376, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1026 = torch.constant.int 1 | |
%int1_1027 = torch.constant.int 1 | |
%1377 = torch.aten.add.Scalar %1350, %int1_1026, %int1_1027 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1377, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%1378 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1379 = torch.aten.view %1377, %1378 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1379, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%1380 = torch.prim.ListConstruct %1379 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1028 = torch.constant.bool false | |
%1381 = torch.aten.index_put %1372, %1380, %1376, %false_1028 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1381, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1029 = torch.constant.int 32 | |
%int2_1030 = torch.constant.int 2 | |
%int32_1031 = torch.constant.int 32 | |
%int8_1032 = torch.constant.int 8 | |
%int128_1033 = torch.constant.int 128 | |
%1382 = torch.prim.ListConstruct %739, %int32_1029, %int2_1030, %int32_1031, %int8_1032, %int128_1033 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1383 = torch.aten.view %1381, %1382 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1383, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1034 = torch.constant.int 2097152 | |
%1384 = torch.prim.ListConstruct %739, %int2097152_1034 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1385 = torch.aten.view %1383, %1384 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1385, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
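// Note (added annotation): grouped-query expansion — the 8 KV heads are broadcast to the 32 query heads
// (group size 4) by unsqueezing to [1, seq, 8, 1, 128], expanding to [1, seq, 8, 4, 128], and flattening to
// [1, seq, 32, 128] for both K and V.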
%int-2_1035 = torch.constant.int -2 | |
%1386 = torch.aten.unsqueeze %1345, %int-2_1035 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1386, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_1036 = torch.constant.int 1 | |
%int8_1037 = torch.constant.int 8 | |
%int4_1038 = torch.constant.int 4 | |
%int128_1039 = torch.constant.int 128 | |
%1387 = torch.prim.ListConstruct %int1_1036, %1312, %int8_1037, %int4_1038, %int128_1039 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1040 = torch.constant.bool false | |
%1388 = torch.aten.expand %1386, %1387, %false_1040 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1388, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_1041 = torch.constant.int 0 | |
%1389 = torch.aten.clone %1388, %int0_1041 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1389, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_1042 = torch.constant.int 1 | |
%int32_1043 = torch.constant.int 32 | |
%int128_1044 = torch.constant.int 128 | |
%1390 = torch.prim.ListConstruct %int1_1042, %1312, %int32_1043, %int128_1044 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1391 = torch.aten._unsafe_view %1389, %1390 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1391, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_1045 = torch.constant.int -2 | |
%1392 = torch.aten.unsqueeze %1348, %int-2_1045 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1392, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_1046 = torch.constant.int 1 | |
%1393 = torch.aten.size.int %1199, %int1_1046 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_1047 = torch.constant.int 1 | |
%int8_1048 = torch.constant.int 8 | |
%int4_1049 = torch.constant.int 4 | |
%int128_1050 = torch.constant.int 128 | |
%1394 = torch.prim.ListConstruct %int1_1047, %1393, %int8_1048, %int4_1049, %int128_1050 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1051 = torch.constant.bool false | |
%1395 = torch.aten.expand %1392, %1394, %false_1051 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1395, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_1052 = torch.constant.int 0 | |
%1396 = torch.aten.clone %1395, %int0_1052 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1396, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_1053 = torch.constant.int 1 | |
%int32_1054 = torch.constant.int 32 | |
%int128_1055 = torch.constant.int 128 | |
%1397 = torch.prim.ListConstruct %int1_1053, %1393, %int32_1054, %int128_1055 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1398 = torch.aten._unsafe_view %1396, %1397 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1398, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
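// Note (added annotation): the expanded K/V are dequantized back to bf16 (cast to f32, multiply by the kv
// scale %42, cast to bf16) before attention.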
%int6_1056 = torch.constant.int 6 | |
%1399 = torch.prims.convert_element_type %1391, %int6_1056 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1399, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%1400 = torch.aten.mul.Tensor %1399, %42 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1400, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_1057 = torch.constant.int 15 | |
%1401 = torch.prims.convert_element_type %1400, %int15_1057 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1401, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_1058 = torch.constant.int 6 | |
%1402 = torch.prims.convert_element_type %1398, %int6_1058 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1402, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%1403 = torch.aten.mul.Tensor %1402, %42 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1403, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_1059 = torch.constant.int 15 | |
%1404 = torch.prims.convert_element_type %1403, %int15_1059 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1404, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
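// Note (added annotation): Q (%1274), K, and V are transposed to [1, 32, seq, 128] and fed to the CPU
// flash-attention kernel with dropout 0.0 and is_causal = true.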
%int1_1060 = torch.constant.int 1 | |
%int2_1061 = torch.constant.int 2 | |
%1405 = torch.aten.transpose.int %1274, %int1_1060, %int2_1061 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1405, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1062 = torch.constant.int 1 | |
%int2_1063 = torch.constant.int 2 | |
%1406 = torch.aten.transpose.int %1401, %int1_1062, %int2_1063 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1406, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1064 = torch.constant.int 1 | |
%int2_1065 = torch.constant.int 2 | |
%1407 = torch.aten.transpose.int %1404, %int1_1064, %int2_1065 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1407, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00_1066 = torch.constant.float 0.000000e+00 | |
%true_1067 = torch.constant.bool true | |
%none_1068 = torch.constant.none | |
%none_1069 = torch.constant.none | |
%1408:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1405, %1406, %1407, %float0.000000e00_1066, %true_1067, %none_1068, %none_1069) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %1408#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
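// Note (added annotation): the attention output is transposed back to [1, seq, 32, 128], flattened to
// [1, seq, 4096], and re-quantized to f8E4M3FNUZ for the output projection (input scale %43, +/-240 clamp).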
%int1_1070 = torch.constant.int 1 | |
%int2_1071 = torch.constant.int 2 | |
%1409 = torch.aten.transpose.int %1408#0, %int1_1070, %int2_1071 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1409, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_1072 = torch.constant.int 1 | |
%int4096_1073 = torch.constant.int 4096 | |
%1410 = torch.prim.ListConstruct %int1_1072, %1244, %int4096_1073 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1411 = torch.aten.view %1409, %1410 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1411, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1412 = torch.aten.div.Tensor %1411, %43 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1412, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1074 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1075 = torch.constant.float 2.400000e+02 | |
%1413 = torch.aten.clamp %1412, %float-2.400000e02_1074, %float2.400000e02_1075 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1413, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1076 = torch.constant.int 26 | |
%1414 = torch.prims.convert_element_type %1413, %int26_1076 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1414, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
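// Note (added annotation): fp8 matmul with the transposed attn_output weight of this block (%44), reshape to
// [1, seq, 4096], cast to bf16, and add the residual %1161.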
%int-2_1077 = torch.constant.int -2 | |
%int-1_1078 = torch.constant.int -1 | |
%1415 = torch.aten.transpose.int %44, %int-2_1077, %int-1_1078 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_1079 = torch.constant.int 4096 | |
%1416 = torch.prim.ListConstruct %1244, %int4096_1079 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1417 = torch.aten.view %1414, %1416 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1417, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1418 = torch.aten.mm %1417, %1415 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1418, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1080 = torch.constant.int 1 | |
%int4096_1081 = torch.constant.int 4096 | |
%1419 = torch.prim.ListConstruct %int1_1080, %1244, %int4096_1081 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1420 = torch.aten.view %1418, %1419 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1420, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1082 = torch.constant.int 15 | |
%1421 = torch.prims.convert_element_type %1420, %int15_1082 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1421, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_1083 = torch.constant.int 1 | |
%1422 = torch.aten.add.Tensor %1161, %1421, %int1_1083 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1422, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
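// Note (added annotation): RMSNorm over the post-attention residual — mean of squares over the hidden dim,
// + 1e-5, rsqrt, scale, cast to bf16, then multiply by the ffn_norm weight %45.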
%int6_1084 = torch.constant.int 6 | |
%1423 = torch.prims.convert_element_type %1422, %int6_1084 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1423, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_1085 = torch.constant.int 2 | |
%1424 = torch.aten.pow.Tensor_Scalar %1423, %int2_1085 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1424, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1086 = torch.constant.int -1 | |
%1425 = torch.prim.ListConstruct %int-1_1086 : (!torch.int) -> !torch.list<int> | |
%true_1087 = torch.constant.bool true | |
%none_1088 = torch.constant.none | |
%1426 = torch.aten.mean.dim %1424, %1425, %true_1087, %none_1088 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1426, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_1089 = torch.constant.float 1.000000e-05 | |
%int1_1090 = torch.constant.int 1 | |
%1427 = torch.aten.add.Scalar %1426, %float1.000000e-05_1089, %int1_1090 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1427, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1428 = torch.aten.rsqrt %1427 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1428, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1429 = torch.aten.mul.Tensor %1423, %1428 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1429, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_1091 = torch.constant.int 15 | |
%1430 = torch.prims.convert_element_type %1429, %int15_1091 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1430, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1431 = torch.aten.mul.Tensor %45, %1430 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1431, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
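// Note (added annotation): SwiGLU feed-forward, gate branch — quantize the normed activations (scale %46,
// +/-240 clamp), fp8 matmul with the transposed ffn_gate weight %47 (4096 -> 14336), cast to bf16, apply silu.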
%1432 = torch.aten.div.Tensor %1431, %46 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1432, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1092 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1093 = torch.constant.float 2.400000e+02 | |
%1433 = torch.aten.clamp %1432, %float-2.400000e02_1092, %float2.400000e02_1093 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1433, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1094 = torch.constant.int 26 | |
%1434 = torch.prims.convert_element_type %1433, %int26_1094 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1434, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1095 = torch.constant.int -2 | |
%int-1_1096 = torch.constant.int -1 | |
%1435 = torch.aten.transpose.int %47, %int-2_1095, %int-1_1096 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_1097 = torch.constant.int 4096 | |
%1436 = torch.prim.ListConstruct %564, %int4096_1097 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1437 = torch.aten.view %1434, %1436 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1437, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1438 = torch.aten.mm %1437, %1435 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1438, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_1098 = torch.constant.int 1 | |
%int14336_1099 = torch.constant.int 14336 | |
%1439 = torch.prim.ListConstruct %int1_1098, %564, %int14336_1099 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1440 = torch.aten.view %1438, %1439 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1440, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_1100 = torch.constant.int 15 | |
%1441 = torch.prims.convert_element_type %1440, %int15_1100 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1441, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1442 = torch.aten.silu %1441 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1442, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
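// Note (added annotation): up branch — the same normed activations %1431 are re-quantized with their own
// scale %48, multiplied by the ffn_up weight %49, cast to bf16, and multiplied elementwise with the
// silu-activated gate.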
%1443 = torch.aten.div.Tensor %1431, %48 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1443, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1101 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1102 = torch.constant.float 2.400000e+02 | |
%1444 = torch.aten.clamp %1443, %float-2.400000e02_1101, %float2.400000e02_1102 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1444, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1103 = torch.constant.int 26 | |
%1445 = torch.prims.convert_element_type %1444, %int26_1103 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1445, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1104 = torch.constant.int -2 | |
%int-1_1105 = torch.constant.int -1 | |
%1446 = torch.aten.transpose.int %49, %int-2_1104, %int-1_1105 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_1106 = torch.constant.int 4096 | |
%1447 = torch.prim.ListConstruct %564, %int4096_1106 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1448 = torch.aten.view %1445, %1447 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1448, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1449 = torch.aten.mm %1448, %1446 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1449, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_1107 = torch.constant.int 1 | |
%int14336_1108 = torch.constant.int 14336 | |
%1450 = torch.prim.ListConstruct %int1_1107, %564, %int14336_1108 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1451 = torch.aten.view %1449, %1450 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1451, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_1109 = torch.constant.int 15 | |
%1452 = torch.prims.convert_element_type %1451, %int15_1109 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1452, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1453 = torch.aten.mul.Tensor %1442, %1452 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1453, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
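// SwiGLU combine above (silu(gate) * up); below, the FFN down projection repeats the quantize -> f8 matmul -> bf16 dequantize pattern with the down weight (14336 -> 4096).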
%1454 = torch.aten.div.Tensor %1453, %50 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1454, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_1110 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1111 = torch.constant.float 2.400000e+02 | |
%1455 = torch.aten.clamp %1454, %float-2.400000e02_1110, %float2.400000e02_1111 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1455, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_1112 = torch.constant.int 26 | |
%1456 = torch.prims.convert_element_type %1455, %int26_1112 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1456, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_1113 = torch.constant.int -2 | |
%int-1_1114 = torch.constant.int -1 | |
%1457 = torch.aten.transpose.int %51, %int-2_1113, %int-1_1114 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_1115 = torch.constant.int 1 | |
%1458 = torch.aten.size.int %1440, %int1_1115 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_1116 = torch.constant.int 14336 | |
%1459 = torch.prim.ListConstruct %1458, %int14336_1116 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1460 = torch.aten.view %1456, %1459 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1460, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%1461 = torch.aten.mm %1460, %1457 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1461, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1117 = torch.constant.int 1 | |
%int4096_1118 = torch.constant.int 4096 | |
%1462 = torch.prim.ListConstruct %int1_1117, %1458, %int4096_1118 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1463 = torch.aten.view %1461, %1462 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1463, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1119 = torch.constant.int 15 | |
%1464 = torch.prims.convert_element_type %1463, %int15_1119 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1464, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
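// Residual add: the FFN output is added back onto the block's residual stream.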
%int1_1120 = torch.constant.int 1 | |
%1465 = torch.aten.add.Tensor %1422, %1464, %int1_1120 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1465, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
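// RMSNorm in f32 (mean of squares over the hidden dim, eps = 1e-5, rsqrt), cast back to bf16 and scaled by the norm weight.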
%int6_1121 = torch.constant.int 6 | |
%1466 = torch.prims.convert_element_type %1465, %int6_1121 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1466, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_1122 = torch.constant.int 2 | |
%1467 = torch.aten.pow.Tensor_Scalar %1466, %int2_1122 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1467, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1123 = torch.constant.int -1 | |
%1468 = torch.prim.ListConstruct %int-1_1123 : (!torch.int) -> !torch.list<int> | |
%true_1124 = torch.constant.bool true | |
%none_1125 = torch.constant.none | |
%1469 = torch.aten.mean.dim %1467, %1468, %true_1124, %none_1125 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1469, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_1126 = torch.constant.float 1.000000e-05 | |
%int1_1127 = torch.constant.int 1 | |
%1470 = torch.aten.add.Scalar %1469, %float1.000000e-05_1126, %int1_1127 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1470, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1471 = torch.aten.rsqrt %1470 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1471, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1472 = torch.aten.mul.Tensor %1466, %1471 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1472, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_1128 = torch.constant.int 15 | |
%1473 = torch.prims.convert_element_type %1472, %int15_1128 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1473, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1474 = torch.aten.mul.Tensor %52, %1473 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1474, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
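// Q projection: quantize the normalized activations to f8E4M3FNUZ (scale, clamp to +/-240) and matmul with the transposed 4096x4096 query weight, then dequantize to bf16.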
%1475 = torch.aten.div.Tensor %1474, %53 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1475, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1129 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1130 = torch.constant.float 2.400000e+02 | |
%1476 = torch.aten.clamp %1475, %float-2.400000e02_1129, %float2.400000e02_1130 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1476, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1131 = torch.constant.int 26 | |
%1477 = torch.prims.convert_element_type %1476, %int26_1131 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1477, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1132 = torch.constant.int -2 | |
%int-1_1133 = torch.constant.int -1 | |
%1478 = torch.aten.transpose.int %54, %int-2_1132, %int-1_1133 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_1134 = torch.constant.int 4096 | |
%1479 = torch.prim.ListConstruct %564, %int4096_1134 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1480 = torch.aten.view %1477, %1479 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1480, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1481 = torch.aten.mm %1480, %1478 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1481, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1135 = torch.constant.int 1 | |
%int4096_1136 = torch.constant.int 4096 | |
%1482 = torch.prim.ListConstruct %int1_1135, %564, %int4096_1136 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1483 = torch.aten.view %1481, %1482 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1483, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1137 = torch.constant.int 15 | |
%1484 = torch.prims.convert_element_type %1483, %int15_1137 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1484, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
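// K projection: same quantize/matmul/dequantize pattern against the 1024x4096 key weight (8 KV heads x 128).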
%1485 = torch.aten.div.Tensor %1474, %55 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1485, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1138 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1139 = torch.constant.float 2.400000e+02 | |
%1486 = torch.aten.clamp %1485, %float-2.400000e02_1138, %float2.400000e02_1139 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1486, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1140 = torch.constant.int 26 | |
%1487 = torch.prims.convert_element_type %1486, %int26_1140 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1487, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1141 = torch.constant.int -2 | |
%int-1_1142 = torch.constant.int -1 | |
%1488 = torch.aten.transpose.int %56, %int-2_1141, %int-1_1142 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_1143 = torch.constant.int 4096 | |
%1489 = torch.prim.ListConstruct %564, %int4096_1143 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1490 = torch.aten.view %1487, %1489 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1490, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1491 = torch.aten.mm %1490, %1488 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1491, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_1144 = torch.constant.int 1 | |
%int1024_1145 = torch.constant.int 1024 | |
%1492 = torch.prim.ListConstruct %int1_1144, %564, %int1024_1145 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1493 = torch.aten.view %1491, %1492 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1493, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_1146 = torch.constant.int 15 | |
%1494 = torch.prims.convert_element_type %1493, %int15_1146 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %1494, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
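// V projection: same pattern against the 1024x4096 value weight.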
%1495 = torch.aten.div.Tensor %1474, %57 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1495, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1147 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1148 = torch.constant.float 2.400000e+02 | |
%1496 = torch.aten.clamp %1495, %float-2.400000e02_1147, %float2.400000e02_1148 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1496, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1149 = torch.constant.int 26 | |
%1497 = torch.prims.convert_element_type %1496, %int26_1149 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1497, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1150 = torch.constant.int -2 | |
%int-1_1151 = torch.constant.int -1 | |
%1498 = torch.aten.transpose.int %58, %int-2_1150, %int-1_1151 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_1152 = torch.constant.int 4096 | |
%1499 = torch.prim.ListConstruct %564, %int4096_1152 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1500 = torch.aten.view %1497, %1499 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1500, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1501 = torch.aten.mm %1500, %1498 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1501, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_1153 = torch.constant.int 1 | |
%int1024_1154 = torch.constant.int 1024 | |
%1502 = torch.prim.ListConstruct %int1_1153, %564, %int1024_1154 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1503 = torch.aten.view %1501, %1502 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1503, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_1155 = torch.constant.int 15 | |
%1504 = torch.prims.convert_element_type %1503, %int15_1155 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %1504, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
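// Split projections into heads: Q -> [1, seq, 32, 128], K and V -> [1, seq, 8, 128] (grouped-query layout).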
%int1_1156 = torch.constant.int 1 | |
%int32_1157 = torch.constant.int 32 | |
%int128_1158 = torch.constant.int 128 | |
%1505 = torch.prim.ListConstruct %int1_1156, %564, %int32_1157, %int128_1158 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1506 = torch.aten.view %1484, %1505 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1506, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_1159 = torch.constant.int 1 | |
%int8_1160 = torch.constant.int 8 | |
%int128_1161 = torch.constant.int 128 | |
%1507 = torch.prim.ListConstruct %int1_1159, %564, %int8_1160, %int128_1161 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1508 = torch.aten.view %1494, %1507 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1508, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_1162 = torch.constant.int 1 | |
%int8_1163 = torch.constant.int 8 | |
%int128_1164 = torch.constant.int 128 | |
%1509 = torch.prim.ListConstruct %int1_1162, %564, %int8_1163, %int128_1164 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1510 = torch.aten.view %1504, %1509 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1510, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
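// Build rotary-embedding tables: inverse frequencies from base 5.0e5 over 64 even dims, with what appears to be Llama-3.1-style context scaling (factor 8, original context 8192, smoothing between wavelengths 2048 and 8192), then cos/sin tables of shape [131072,128] cast to bf16.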
%int131072_1165 = torch.constant.int 131072 | |
%none_1166 = torch.constant.none | |
%none_1167 = torch.constant.none | |
%cpu_1168 = torch.constant.device "cpu" | |
%false_1169 = torch.constant.bool false | |
%1511 = torch.aten.arange %int131072_1165, %none_1166, %none_1167, %cpu_1168, %false_1169 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1170 = torch.constant.int 0 | |
%int128_1171 = torch.constant.int 128 | |
%int2_1172 = torch.constant.int 2 | |
%int4_1173 = torch.constant.int 4 | |
%none_1174 = torch.constant.none | |
%cpu_1175 = torch.constant.device "cpu" | |
%false_1176 = torch.constant.bool false | |
%1512 = torch.aten.arange.start_step %int0_1170, %int128_1171, %int2_1172, %int4_1173, %none_1174, %cpu_1175, %false_1176 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1177 = torch.constant.int 6 | |
%1513 = torch.prims.convert_element_type %1512, %int6_1177 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1178 = torch.constant.int 128 | |
%1514 = torch.aten.div.Scalar %1513, %int128_1178 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1179 = torch.constant.float 5.000000e+05 | |
%1515 = torch.aten.pow.Scalar %float5.000000e05_1179, %1514 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1516 = torch.aten.reciprocal %1515 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1180 = torch.constant.float 1.000000e+00 | |
%1517 = torch.aten.mul.Scalar %1516, %float1.000000e00_1180 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1518 = torch.aten.reciprocal %1517 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1181 = torch.constant.float 6.2831853071795862 | |
%1519 = torch.aten.mul.Scalar %1518, %float6.283190e00_1181 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1182 = torch.constant.float 8.192000e+03 | |
%1520 = torch.aten.gt.Scalar %1519, %float8.192000e03_1182 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1183 = torch.constant.int 8 | |
%1521 = torch.aten.div.Scalar %1517, %int8_1183 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1522 = torch.aten.where.self %1520, %1521, %1517 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1523 = torch.aten.reciprocal %1519 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1184 = torch.constant.int 8192 | |
%1524 = torch.aten.mul.Scalar %1523, %int8192_1184 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1185 = torch.constant.int 1 | |
%int1_1186 = torch.constant.int 1 | |
%1525 = torch.aten.sub.Scalar %1524, %int1_1185, %int1_1186 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1187 = torch.constant.int 3 | |
%1526 = torch.aten.div.Scalar %1525, %int3_1187 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1188 = torch.constant.int 1 | |
%int1_1189 = torch.constant.int 1 | |
%1527 = torch.aten.rsub.Scalar %1526, %int1_1188, %int1_1189 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1528 = torch.aten.mul.Tensor %1527, %1522 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1190 = torch.constant.int 8 | |
%1529 = torch.aten.div.Scalar %1528, %int8_1190 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1530 = torch.aten.mul.Tensor %1526, %1522 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1191 = torch.constant.int 1 | |
%1531 = torch.aten.add.Tensor %1529, %1530, %int1_1191 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1192 = torch.constant.float 2.048000e+03 | |
%1532 = torch.aten.lt.Scalar %1519, %float2.048000e03_1192 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1533 = torch.aten.bitwise_not %1532 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1193 = torch.constant.float 8.192000e+03 | |
%1534 = torch.aten.gt.Scalar %1519, %float8.192000e03_1193 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1535 = torch.aten.bitwise_not %1534 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1536 = torch.aten.mul.Tensor %1533, %1535 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1537 = torch.aten.where.self %1536, %1531, %1522 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1538 = torch.prim.ListConstruct %1537, %1537 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1194 = torch.constant.int -1 | |
%1539 = torch.aten.cat %1538, %int-1_1194 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1195 = torch.constant.int 6 | |
%1540 = torch.prims.convert_element_type %1511, %int6_1195 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_1196 = torch.constant.int 131072 | |
%int1_1197 = torch.constant.int 1 | |
%1541 = torch.prim.ListConstruct %int131072_1196, %int1_1197 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1542 = torch.aten.view %1540, %1541 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1543 = torch.aten.mul.Tensor %1542, %1539 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1544 = torch.aten.cos %1543 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1198 = torch.constant.int 15 | |
%1545 = torch.prims.convert_element_type %1544, %int15_1198 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1546 = torch.aten.sin %1543 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1199 = torch.constant.int 15 | |
%1547 = torch.prims.convert_element_type %1546, %int15_1199 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
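// Slice the cos/sin tables to the current sequence length and reshape to [1, seq, 1, 128] for broadcasting over heads.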
%int1_1200 = torch.constant.int 1 | |
%1548 = torch.aten.size.int %1483, %int1_1200 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_1201 = torch.constant.int 0 | |
%1549 = torch.aten.add.int %int0_1201, %1548 : !torch.int, !torch.int -> !torch.int | |
%int0_1202 = torch.constant.int 0 | |
%int0_1203 = torch.constant.int 0 | |
%int1_1204 = torch.constant.int 1 | |
%1550 = torch.aten.slice.Tensor %1545, %int0_1202, %int0_1203, %1549, %int1_1204 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1550, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1205 = torch.constant.int 1 | |
%int0_1206 = torch.constant.int 0 | |
%int9223372036854775807_1207 = torch.constant.int 9223372036854775807 | |
%int1_1208 = torch.constant.int 1 | |
%1551 = torch.aten.slice.Tensor %1550, %int1_1205, %int0_1206, %int9223372036854775807_1207, %int1_1208 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1551, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1209 = torch.constant.int 0 | |
%1552 = torch.aten.add.int %int0_1209, %1548 : !torch.int, !torch.int -> !torch.int | |
%int0_1210 = torch.constant.int 0 | |
%int0_1211 = torch.constant.int 0 | |
%int1_1212 = torch.constant.int 1 | |
%1553 = torch.aten.slice.Tensor %1547, %int0_1210, %int0_1211, %1552, %int1_1212 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1553, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1213 = torch.constant.int 1 | |
%int0_1214 = torch.constant.int 0 | |
%int9223372036854775807_1215 = torch.constant.int 9223372036854775807 | |
%int1_1216 = torch.constant.int 1 | |
%1554 = torch.aten.slice.Tensor %1553, %int1_1213, %int0_1214, %int9223372036854775807_1215, %int1_1216 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1554, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1217 = torch.constant.int 0 | |
%1555 = torch.aten.unsqueeze %1551, %int0_1217 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1555, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1218 = torch.constant.int 1 | |
%int0_1219 = torch.constant.int 0 | |
%int9223372036854775807_1220 = torch.constant.int 9223372036854775807 | |
%int1_1221 = torch.constant.int 1 | |
%1556 = torch.aten.slice.Tensor %1555, %int1_1218, %int0_1219, %int9223372036854775807_1220, %int1_1221 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1556, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1222 = torch.constant.int 2 | |
%1557 = torch.aten.unsqueeze %1556, %int2_1222 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1557, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1223 = torch.constant.int 3 | |
%int0_1224 = torch.constant.int 0 | |
%int9223372036854775807_1225 = torch.constant.int 9223372036854775807 | |
%int1_1226 = torch.constant.int 1 | |
%1558 = torch.aten.slice.Tensor %1557, %int3_1223, %int0_1224, %int9223372036854775807_1225, %int1_1226 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1558, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_1227 = torch.constant.int 0 | |
%1559 = torch.aten.unsqueeze %1554, %int0_1227 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1559, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1228 = torch.constant.int 1 | |
%int0_1229 = torch.constant.int 0 | |
%int9223372036854775807_1230 = torch.constant.int 9223372036854775807 | |
%int1_1231 = torch.constant.int 1 | |
%1560 = torch.aten.slice.Tensor %1559, %int1_1228, %int0_1229, %int9223372036854775807_1230, %int1_1231 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1560, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1232 = torch.constant.int 2 | |
%1561 = torch.aten.unsqueeze %1560, %int2_1232 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1561, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1233 = torch.constant.int 3 | |
%int0_1234 = torch.constant.int 0 | |
%int9223372036854775807_1235 = torch.constant.int 9223372036854775807 | |
%int1_1236 = torch.constant.int 1 | |
%1562 = torch.aten.slice.Tensor %1561, %int3_1233, %int0_1234, %int9223372036854775807_1235, %int1_1236 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1562, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
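// Apply RoPE to the query heads: broadcast cos/sin to [1,1,seq,128], form rotate_half(q) by negating the upper 64 dims and concatenating, combine q*cos + rotate_half(q)*sin, then transpose back to [1,seq,32,128].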
%int1_1237 = torch.constant.int 1 | |
%int2_1238 = torch.constant.int 2 | |
%1563 = torch.aten.transpose.int %1558, %int1_1237, %int2_1238 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1563, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1239 = torch.constant.int 1 | |
%int1_1240 = torch.constant.int 1 | |
%int1_1241 = torch.constant.int 1 | |
%int1_1242 = torch.constant.int 1 | |
%1564 = torch.prim.ListConstruct %int1_1239, %int1_1240, %int1_1241, %int1_1242 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1565 = torch.aten.repeat %1563, %1564 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1565, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1243 = torch.constant.int 1 | |
%int2_1244 = torch.constant.int 2 | |
%1566 = torch.aten.transpose.int %1562, %int1_1243, %int2_1244 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1566, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1245 = torch.constant.int 1 | |
%int2_1246 = torch.constant.int 2 | |
%1567 = torch.aten.transpose.int %1506, %int1_1245, %int2_1246 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1567, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1247 = torch.constant.int 1 | |
%int1_1248 = torch.constant.int 1 | |
%int1_1249 = torch.constant.int 1 | |
%int1_1250 = torch.constant.int 1 | |
%1568 = torch.prim.ListConstruct %int1_1247, %int1_1248, %int1_1249, %int1_1250 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1569 = torch.aten.repeat %1566, %1568 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1569, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1570 = torch.aten.mul.Tensor %1567, %1565 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1570, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_1251 = torch.constant.int 3 | |
%int0_1252 = torch.constant.int 0 | |
%int64_1253 = torch.constant.int 64 | |
%int1_1254 = torch.constant.int 1 | |
%1571 = torch.aten.slice.Tensor %1567, %int3_1251, %int0_1252, %int64_1253, %int1_1254 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1571, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_1255 = torch.constant.int 3 | |
%int64_1256 = torch.constant.int 64 | |
%int9223372036854775807_1257 = torch.constant.int 9223372036854775807 | |
%int1_1258 = torch.constant.int 1 | |
%1572 = torch.aten.slice.Tensor %1567, %int3_1255, %int64_1256, %int9223372036854775807_1257, %int1_1258 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1572, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%1573 = torch.aten.neg %1572 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1573, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%1574 = torch.prim.ListConstruct %1573, %1571 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_1259 = torch.constant.int -1 | |
%1575 = torch.aten.cat %1574, %int-1_1259 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1575, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%1576 = torch.aten.mul.Tensor %1575, %1569 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1576, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1260 = torch.constant.int 1 | |
%1577 = torch.aten.add.Tensor %1570, %1576, %int1_1260 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1577, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1261 = torch.constant.int 1 | |
%int2_1262 = torch.constant.int 2 | |
%1578 = torch.aten.transpose.int %1577, %int1_1261, %int2_1262 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1578, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
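// The same rotary tables are recomputed from scratch (identical constants) and sliced again for the key path.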
%int131072_1263 = torch.constant.int 131072 | |
%none_1264 = torch.constant.none | |
%none_1265 = torch.constant.none | |
%cpu_1266 = torch.constant.device "cpu" | |
%false_1267 = torch.constant.bool false | |
%1579 = torch.aten.arange %int131072_1263, %none_1264, %none_1265, %cpu_1266, %false_1267 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1268 = torch.constant.int 0 | |
%int128_1269 = torch.constant.int 128 | |
%int2_1270 = torch.constant.int 2 | |
%int4_1271 = torch.constant.int 4 | |
%none_1272 = torch.constant.none | |
%cpu_1273 = torch.constant.device "cpu" | |
%false_1274 = torch.constant.bool false | |
%1580 = torch.aten.arange.start_step %int0_1268, %int128_1269, %int2_1270, %int4_1271, %none_1272, %cpu_1273, %false_1274 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1275 = torch.constant.int 6 | |
%1581 = torch.prims.convert_element_type %1580, %int6_1275 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1276 = torch.constant.int 128 | |
%1582 = torch.aten.div.Scalar %1581, %int128_1276 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1277 = torch.constant.float 5.000000e+05 | |
%1583 = torch.aten.pow.Scalar %float5.000000e05_1277, %1582 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1584 = torch.aten.reciprocal %1583 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1278 = torch.constant.float 1.000000e+00 | |
%1585 = torch.aten.mul.Scalar %1584, %float1.000000e00_1278 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1586 = torch.aten.reciprocal %1585 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1279 = torch.constant.float 6.2831853071795862 | |
%1587 = torch.aten.mul.Scalar %1586, %float6.283190e00_1279 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1280 = torch.constant.float 8.192000e+03 | |
%1588 = torch.aten.gt.Scalar %1587, %float8.192000e03_1280 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1281 = torch.constant.int 8 | |
%1589 = torch.aten.div.Scalar %1585, %int8_1281 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1590 = torch.aten.where.self %1588, %1589, %1585 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1591 = torch.aten.reciprocal %1587 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1282 = torch.constant.int 8192 | |
%1592 = torch.aten.mul.Scalar %1591, %int8192_1282 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1283 = torch.constant.int 1 | |
%int1_1284 = torch.constant.int 1 | |
%1593 = torch.aten.sub.Scalar %1592, %int1_1283, %int1_1284 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1285 = torch.constant.int 3 | |
%1594 = torch.aten.div.Scalar %1593, %int3_1285 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1286 = torch.constant.int 1 | |
%int1_1287 = torch.constant.int 1 | |
%1595 = torch.aten.rsub.Scalar %1594, %int1_1286, %int1_1287 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1596 = torch.aten.mul.Tensor %1595, %1590 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1288 = torch.constant.int 8 | |
%1597 = torch.aten.div.Scalar %1596, %int8_1288 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1598 = torch.aten.mul.Tensor %1594, %1590 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1289 = torch.constant.int 1 | |
%1599 = torch.aten.add.Tensor %1597, %1598, %int1_1289 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1290 = torch.constant.float 2.048000e+03 | |
%1600 = torch.aten.lt.Scalar %1587, %float2.048000e03_1290 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1601 = torch.aten.bitwise_not %1600 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1291 = torch.constant.float 8.192000e+03 | |
%1602 = torch.aten.gt.Scalar %1587, %float8.192000e03_1291 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1603 = torch.aten.bitwise_not %1602 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1604 = torch.aten.mul.Tensor %1601, %1603 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1605 = torch.aten.where.self %1604, %1599, %1590 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1606 = torch.prim.ListConstruct %1605, %1605 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1292 = torch.constant.int -1 | |
%1607 = torch.aten.cat %1606, %int-1_1292 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1293 = torch.constant.int 6 | |
%1608 = torch.prims.convert_element_type %1579, %int6_1293 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_1294 = torch.constant.int 131072 | |
%int1_1295 = torch.constant.int 1 | |
%1609 = torch.prim.ListConstruct %int131072_1294, %int1_1295 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1610 = torch.aten.view %1608, %1609 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1611 = torch.aten.mul.Tensor %1610, %1607 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1612 = torch.aten.cos %1611 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1296 = torch.constant.int 15 | |
%1613 = torch.prims.convert_element_type %1612, %int15_1296 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1614 = torch.aten.sin %1611 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1297 = torch.constant.int 15 | |
%1615 = torch.prims.convert_element_type %1614, %int15_1297 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int1_1298 = torch.constant.int 1 | |
%1616 = torch.aten.size.int %1493, %int1_1298 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_1299 = torch.constant.int 0 | |
%1617 = torch.aten.add.int %int0_1299, %1616 : !torch.int, !torch.int -> !torch.int | |
%int0_1300 = torch.constant.int 0 | |
%int0_1301 = torch.constant.int 0 | |
%int1_1302 = torch.constant.int 1 | |
%1618 = torch.aten.slice.Tensor %1613, %int0_1300, %int0_1301, %1617, %int1_1302 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1618, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1303 = torch.constant.int 1 | |
%int0_1304 = torch.constant.int 0 | |
%int9223372036854775807_1305 = torch.constant.int 9223372036854775807 | |
%int1_1306 = torch.constant.int 1 | |
%1619 = torch.aten.slice.Tensor %1618, %int1_1303, %int0_1304, %int9223372036854775807_1305, %int1_1306 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1619, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1307 = torch.constant.int 0 | |
%1620 = torch.aten.add.int %int0_1307, %1616 : !torch.int, !torch.int -> !torch.int | |
%int0_1308 = torch.constant.int 0 | |
%int0_1309 = torch.constant.int 0 | |
%int1_1310 = torch.constant.int 1 | |
%1621 = torch.aten.slice.Tensor %1615, %int0_1308, %int0_1309, %1620, %int1_1310 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1621, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1311 = torch.constant.int 1 | |
%int0_1312 = torch.constant.int 0 | |
%int9223372036854775807_1313 = torch.constant.int 9223372036854775807 | |
%int1_1314 = torch.constant.int 1 | |
%1622 = torch.aten.slice.Tensor %1621, %int1_1311, %int0_1312, %int9223372036854775807_1313, %int1_1314 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1622, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1315 = torch.constant.int 0 | |
%1623 = torch.aten.unsqueeze %1619, %int0_1315 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1623, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1316 = torch.constant.int 1 | |
%int0_1317 = torch.constant.int 0 | |
%int9223372036854775807_1318 = torch.constant.int 9223372036854775807 | |
%int1_1319 = torch.constant.int 1 | |
%1624 = torch.aten.slice.Tensor %1623, %int1_1316, %int0_1317, %int9223372036854775807_1318, %int1_1319 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1624, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1320 = torch.constant.int 2 | |
%1625 = torch.aten.unsqueeze %1624, %int2_1320 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1625, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1321 = torch.constant.int 3 | |
%int0_1322 = torch.constant.int 0 | |
%int9223372036854775807_1323 = torch.constant.int 9223372036854775807 | |
%int1_1324 = torch.constant.int 1 | |
%1626 = torch.aten.slice.Tensor %1625, %int3_1321, %int0_1322, %int9223372036854775807_1323, %int1_1324 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1626, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_1325 = torch.constant.int 0 | |
%1627 = torch.aten.unsqueeze %1622, %int0_1325 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1627, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1326 = torch.constant.int 1 | |
%int0_1327 = torch.constant.int 0 | |
%int9223372036854775807_1328 = torch.constant.int 9223372036854775807 | |
%int1_1329 = torch.constant.int 1 | |
%1628 = torch.aten.slice.Tensor %1627, %int1_1326, %int0_1327, %int9223372036854775807_1328, %int1_1329 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1628, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1330 = torch.constant.int 2 | |
%1629 = torch.aten.unsqueeze %1628, %int2_1330 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1629, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1331 = torch.constant.int 3 | |
%int0_1332 = torch.constant.int 0 | |
%int9223372036854775807_1333 = torch.constant.int 9223372036854775807 | |
%int1_1334 = torch.constant.int 1 | |
%1630 = torch.aten.slice.Tensor %1629, %int3_1331, %int0_1332, %int9223372036854775807_1333, %int1_1334 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1630, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
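// Apply RoPE to the key heads (8 KV heads), mirroring the query rotation, then transpose back to [1,seq,8,128].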
%int1_1335 = torch.constant.int 1 | |
%int2_1336 = torch.constant.int 2 | |
%1631 = torch.aten.transpose.int %1626, %int1_1335, %int2_1336 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1631, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1337 = torch.constant.int 1 | |
%int1_1338 = torch.constant.int 1 | |
%int1_1339 = torch.constant.int 1 | |
%int1_1340 = torch.constant.int 1 | |
%1632 = torch.prim.ListConstruct %int1_1337, %int1_1338, %int1_1339, %int1_1340 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1633 = torch.aten.repeat %1631, %1632 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1633, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1341 = torch.constant.int 1 | |
%int2_1342 = torch.constant.int 2 | |
%1634 = torch.aten.transpose.int %1630, %int1_1341, %int2_1342 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1634, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1343 = torch.constant.int 1 | |
%int2_1344 = torch.constant.int 2 | |
%1635 = torch.aten.transpose.int %1508, %int1_1343, %int2_1344 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1635, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_1345 = torch.constant.int 1 | |
%int1_1346 = torch.constant.int 1 | |
%int1_1347 = torch.constant.int 1 | |
%int1_1348 = torch.constant.int 1 | |
%1636 = torch.prim.ListConstruct %int1_1345, %int1_1346, %int1_1347, %int1_1348 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1637 = torch.aten.repeat %1634, %1636 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1637, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1638 = torch.aten.mul.Tensor %1635, %1633 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1638, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_1349 = torch.constant.int 3 | |
%int0_1350 = torch.constant.int 0 | |
%int64_1351 = torch.constant.int 64 | |
%int1_1352 = torch.constant.int 1 | |
%1639 = torch.aten.slice.Tensor %1635, %int3_1349, %int0_1350, %int64_1351, %int1_1352 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1639, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_1353 = torch.constant.int 3 | |
%int64_1354 = torch.constant.int 64 | |
%int9223372036854775807_1355 = torch.constant.int 9223372036854775807 | |
%int1_1356 = torch.constant.int 1 | |
%1640 = torch.aten.slice.Tensor %1635, %int3_1353, %int64_1354, %int9223372036854775807_1355, %int1_1356 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1640, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1641 = torch.aten.neg %1640 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1641, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1642 = torch.prim.ListConstruct %1641, %1639 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_1357 = torch.constant.int -1 | |
%1643 = torch.aten.cat %1642, %int-1_1357 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1643, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%1644 = torch.aten.mul.Tensor %1643, %1637 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1644, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_1358 = torch.constant.int 1 | |
%1645 = torch.aten.add.Tensor %1638, %1644, %int1_1358 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1645, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_1359 = torch.constant.int 1 | |
%int2_1360 = torch.constant.int 2 | |
%1646 = torch.aten.transpose.int %1645, %int1_1359, %int2_1360 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1646, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
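// Quantize the rotated keys and the values to f8E4M3FNUZ with the KV-cache quantizer scale (clamp to +/-240) before writing them into the cache.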
%1647 = torch.aten.div.Tensor %1646, %59 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1647, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_1361 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1362 = torch.constant.float 2.400000e+02 | |
%1648 = torch.aten.clamp %1647, %float-2.400000e02_1361, %float2.400000e02_1362 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1648, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_1363 = torch.constant.int 26 | |
%1649 = torch.prims.convert_element_type %1648, %int26_1363 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1649, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%1650 = torch.aten.div.Tensor %1510, %59 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1650, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_1364 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1365 = torch.constant.float 2.400000e+02 | |
%1651 = torch.aten.clamp %1650, %float-2.400000e02_1364, %float2.400000e02_1365 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1651, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_1366 = torch.constant.int 26 | |
%1652 = torch.prims.convert_element_type %1651, %int26_1366 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1652, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
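// Compute flat KV-cache slot indices from the page table %arg2: page_id * 64 + a fixed offset (64 = 32 layers x 2 K/V planes per page; the offset presumably selects this layer's K plane).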
%int64_1367 = torch.constant.int 64 | |
%1653 = torch.aten.mul.Scalar %arg2, %int64_1367 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1653, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int6_1368 = torch.constant.int 6 | |
%int1_1369 = torch.constant.int 1 | |
%1654 = torch.aten.add.Scalar %1653, %int6_1368, %int1_1369 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1654, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int1_1370 = torch.constant.int 1 | |
%int32_1371 = torch.constant.int 32 | |
%int8_1372 = torch.constant.int 8 | |
%int128_1373 = torch.constant.int 128 | |
%1655 = torch.prim.ListConstruct %int1_1370, %748, %int32_1371, %int8_1372, %int128_1373 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1656 = torch.aten.view %1649, %1655 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1656, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_1374 = torch.constant.int 32 | |
%int8_1375 = torch.constant.int 8 | |
%int128_1376 = torch.constant.int 128 | |
%1657 = torch.prim.ListConstruct %748, %int32_1374, %int8_1375, %int128_1376 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1658 = torch.aten.view %1656, %1657 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1658, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1659 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1660 = torch.aten.view %1654, %1659 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1660, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
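// Write K: view the flat cache as [pages, 32, 2, 32, 8, 128], collapse the leading dims, scatter the quantized key pages with index_put, and restore the [?, 2097152] cache view.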
%int32_1377 = torch.constant.int 32 | |
%int2_1378 = torch.constant.int 2 | |
%int32_1379 = torch.constant.int 32 | |
%int8_1380 = torch.constant.int 8 | |
%int128_1381 = torch.constant.int 128 | |
%1661 = torch.prim.ListConstruct %739, %int32_1377, %int2_1378, %int32_1379, %int8_1380, %int128_1381 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1662 = torch.aten.view %1385, %1661 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1662, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1382 = torch.constant.int 32 | |
%1663 = torch.aten.mul.int %739, %int32_1382 : !torch.int, !torch.int -> !torch.int | |
%int2_1383 = torch.constant.int 2 | |
%1664 = torch.aten.mul.int %1663, %int2_1383 : !torch.int, !torch.int -> !torch.int | |
%int32_1384 = torch.constant.int 32 | |
%int8_1385 = torch.constant.int 8 | |
%int128_1386 = torch.constant.int 128 | |
%1665 = torch.prim.ListConstruct %1664, %int32_1384, %int8_1385, %int128_1386 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1666 = torch.aten.view %1662, %1665 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1666, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1667 = torch.prim.ListConstruct %1660 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1387 = torch.constant.bool false | |
%1668 = torch.aten.index_put %1666, %1667, %1658, %false_1387 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1668, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1388 = torch.constant.int 32 | |
%int2_1389 = torch.constant.int 2 | |
%int32_1390 = torch.constant.int 32 | |
%int8_1391 = torch.constant.int 8 | |
%int128_1392 = torch.constant.int 128 | |
%1669 = torch.prim.ListConstruct %739, %int32_1388, %int2_1389, %int32_1390, %int8_1391, %int128_1392 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1670 = torch.aten.view %1668, %1669 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1670, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1393 = torch.constant.int 2097152 | |
%1671 = torch.prim.ListConstruct %739, %int2097152_1393 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1672 = torch.aten.view %1670, %1671 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1672, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
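// KV-cache write, V partition: the same reinterpret/scatter sequence is repeated, writing the new V pages
// at slot id + 1 (%1681), i.e. what looks like the V partition adjacent to the K slot written above.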
%int32_1394 = torch.constant.int 32 | |
%int2_1395 = torch.constant.int 2 | |
%int32_1396 = torch.constant.int 32 | |
%int8_1397 = torch.constant.int 8 | |
%int128_1398 = torch.constant.int 128 | |
%1673 = torch.prim.ListConstruct %739, %int32_1394, %int2_1395, %int32_1396, %int8_1397, %int128_1398 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1674 = torch.aten.view %1672, %1673 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1674, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1399 = torch.constant.int 32 | |
%int8_1400 = torch.constant.int 8 | |
%int128_1401 = torch.constant.int 128 | |
%1675 = torch.prim.ListConstruct %1664, %int32_1399, %int8_1400, %int128_1401 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1676 = torch.aten.view %1674, %1675 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1676, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1402 = torch.constant.int 1 | |
%int32_1403 = torch.constant.int 32 | |
%int8_1404 = torch.constant.int 8 | |
%int128_1405 = torch.constant.int 128 | |
%1677 = torch.prim.ListConstruct %int1_1402, %748, %int32_1403, %int8_1404, %int128_1405 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1678 = torch.aten.view %1652, %1677 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1678, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_1406 = torch.constant.int 32 | |
%int8_1407 = torch.constant.int 8 | |
%int128_1408 = torch.constant.int 128 | |
%1679 = torch.prim.ListConstruct %748, %int32_1406, %int8_1407, %int128_1408 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1680 = torch.aten.view %1678, %1679 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1680, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1409 = torch.constant.int 1 | |
%int1_1410 = torch.constant.int 1 | |
%1681 = torch.aten.add.Scalar %1654, %int1_1409, %int1_1410 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1681, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%1682 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1683 = torch.aten.view %1681, %1682 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1683, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%1684 = torch.prim.ListConstruct %1683 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1411 = torch.constant.bool false | |
%1685 = torch.aten.index_put %1676, %1684, %1680, %false_1411 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1685, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1412 = torch.constant.int 32 | |
%int2_1413 = torch.constant.int 2 | |
%int32_1414 = torch.constant.int 32 | |
%int8_1415 = torch.constant.int 8 | |
%int128_1416 = torch.constant.int 128 | |
%1686 = torch.prim.ListConstruct %739, %int32_1412, %int2_1413, %int32_1414, %int8_1415, %int128_1416 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1687 = torch.aten.view %1685, %1686 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1687, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1417 = torch.constant.int 2097152 | |
%1688 = torch.prim.ListConstruct %739, %int2097152_1417 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1689 = torch.aten.view %1687, %1688 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1689, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
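// %1689 is the updated flat KV cache. Next, grouped-query attention: the 8 KV heads of the new K are
// repeated 4x (unsqueeze -> expand -> clone -> _unsafe_view) to match the 32 query heads.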
%int-2_1418 = torch.constant.int -2 | |
%1690 = torch.aten.unsqueeze %1649, %int-2_1418 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1690, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_1419 = torch.constant.int 1 | |
%int8_1420 = torch.constant.int 8 | |
%int4_1421 = torch.constant.int 4 | |
%int128_1422 = torch.constant.int 128 | |
%1691 = torch.prim.ListConstruct %int1_1419, %1616, %int8_1420, %int4_1421, %int128_1422 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1423 = torch.constant.bool false | |
%1692 = torch.aten.expand %1690, %1691, %false_1423 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1692, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_1424 = torch.constant.int 0 | |
%1693 = torch.aten.clone %1692, %int0_1424 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1693, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_1425 = torch.constant.int 1 | |
%int32_1426 = torch.constant.int 32 | |
%int128_1427 = torch.constant.int 128 | |
%1694 = torch.prim.ListConstruct %int1_1425, %1616, %int32_1426, %int128_1427 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1695 = torch.aten._unsafe_view %1693, %1694 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1695, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_1428 = torch.constant.int -2 | |
%1696 = torch.aten.unsqueeze %1652, %int-2_1428 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1696, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_1429 = torch.constant.int 1 | |
%1697 = torch.aten.size.int %1503, %int1_1429 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_1430 = torch.constant.int 1 | |
%int8_1431 = torch.constant.int 8 | |
%int4_1432 = torch.constant.int 4 | |
%int128_1433 = torch.constant.int 128 | |
%1698 = torch.prim.ListConstruct %int1_1430, %1697, %int8_1431, %int4_1432, %int128_1433 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1434 = torch.constant.bool false | |
%1699 = torch.aten.expand %1696, %1698, %false_1434 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1699, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_1435 = torch.constant.int 0 | |
%1700 = torch.aten.clone %1699, %int0_1435 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1700, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_1436 = torch.constant.int 1 | |
%int32_1437 = torch.constant.int 32 | |
%int128_1438 = torch.constant.int 128 | |
%1701 = torch.prim.ListConstruct %int1_1436, %1697, %int32_1437, %int128_1438 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1702 = torch.aten._unsafe_view %1700, %1701 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1702, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
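// Dequantize the expanded K and V: upcast f8E4M3FNUZ -> f32, multiply by %59 (presumably the per-tensor
// KV-cache quantizer scale), and truncate to bf16 for attention.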
%int6_1439 = torch.constant.int 6 | |
%1703 = torch.prims.convert_element_type %1695, %int6_1439 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1703, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%1704 = torch.aten.mul.Tensor %1703, %59 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1704, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_1440 = torch.constant.int 15 | |
%1705 = torch.prims.convert_element_type %1704, %int15_1440 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1705, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_1441 = torch.constant.int 6 | |
%1706 = torch.prims.convert_element_type %1702, %int6_1441 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1706, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%1707 = torch.aten.mul.Tensor %1706, %59 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %1707, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_1442 = torch.constant.int 15 | |
%1708 = torch.prims.convert_element_type %1707, %int15_1442 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1708, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
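// Move heads to dim 1 ([1, 32, seq, 128]) for Q (%1578), K and V, then run causal flash attention in bf16.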
%int1_1443 = torch.constant.int 1 | |
%int2_1444 = torch.constant.int 2 | |
%1709 = torch.aten.transpose.int %1578, %int1_1443, %int2_1444 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1709, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1445 = torch.constant.int 1 | |
%int2_1446 = torch.constant.int 2 | |
%1710 = torch.aten.transpose.int %1705, %int1_1445, %int2_1446 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1710, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1447 = torch.constant.int 1 | |
%int2_1448 = torch.constant.int 2 | |
%1711 = torch.aten.transpose.int %1708, %int1_1447, %int2_1448 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1711, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00_1449 = torch.constant.float 0.000000e+00 | |
%true_1450 = torch.constant.bool true | |
%none_1451 = torch.constant.none | |
%none_1452 = torch.constant.none | |
%1712:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1709, %1710, %1711, %float0.000000e00_1449, %true_1450, %none_1451, %none_1452) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %1712#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1453 = torch.constant.int 1 | |
%int2_1454 = torch.constant.int 2 | |
%1713 = torch.aten.transpose.int %1712#0, %int1_1453, %int2_1454 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1713, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
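// Merge heads back to [1, seq, 4096], re-quantize to f8E4M3FNUZ (divide by the input scale %60, clamp to
// +/-240, the f8E4M3FNUZ maximum), and apply the 4096x4096 attention output projection.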
%int1_1455 = torch.constant.int 1 | |
%int4096_1456 = torch.constant.int 4096 | |
%1714 = torch.prim.ListConstruct %int1_1455, %1548, %int4096_1456 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1715 = torch.aten.view %1713, %1714 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1715, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1716 = torch.aten.div.Tensor %1715, %60 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1716, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1457 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1458 = torch.constant.float 2.400000e+02 | |
%1717 = torch.aten.clamp %1716, %float-2.400000e02_1457, %float2.400000e02_1458 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1717, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1459 = torch.constant.int 26 | |
%1718 = torch.prims.convert_element_type %1717, %int26_1459 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1718, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1460 = torch.constant.int -2 | |
%int-1_1461 = torch.constant.int -1 | |
%1719 = torch.aten.transpose.int %61, %int-2_1460, %int-1_1461 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_1462 = torch.constant.int 4096 | |
%1720 = torch.prim.ListConstruct %1548, %int4096_1462 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1721 = torch.aten.view %1718, %1720 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1721, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1722 = torch.aten.mm %1721, %1719 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1722, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1463 = torch.constant.int 1 | |
%int4096_1464 = torch.constant.int 4096 | |
%1723 = torch.prim.ListConstruct %int1_1463, %1548, %int4096_1464 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1724 = torch.aten.view %1722, %1723 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1724, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1465 = torch.constant.int 15 | |
%1725 = torch.prims.convert_element_type %1724, %int15_1465 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1725, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_1466 = torch.constant.int 1 | |
%1726 = torch.aten.add.Tensor %1465, %1725, %int1_1466 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1726, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
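// %1726 is the post-attention residual. RMSNorm for the FFN follows: mean of squares over the last dim,
// eps = 1.0e-5, rsqrt, then scale by the ffn_norm weight (%62).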
%int6_1467 = torch.constant.int 6 | |
%1727 = torch.prims.convert_element_type %1726, %int6_1467 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1727, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_1468 = torch.constant.int 2 | |
%1728 = torch.aten.pow.Tensor_Scalar %1727, %int2_1468 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1728, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1469 = torch.constant.int -1 | |
%1729 = torch.prim.ListConstruct %int-1_1469 : (!torch.int) -> !torch.list<int> | |
%true_1470 = torch.constant.bool true | |
%none_1471 = torch.constant.none | |
%1730 = torch.aten.mean.dim %1728, %1729, %true_1470, %none_1471 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1730, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_1472 = torch.constant.float 1.000000e-05 | |
%int1_1473 = torch.constant.int 1 | |
%1731 = torch.aten.add.Scalar %1730, %float1.000000e-05_1472, %int1_1473 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1731, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1732 = torch.aten.rsqrt %1731 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1732, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1733 = torch.aten.mul.Tensor %1727, %1732 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1733, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_1474 = torch.constant.int 15 | |
%1734 = torch.prims.convert_element_type %1733, %int15_1474 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1734, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1735 = torch.aten.mul.Tensor %62, %1734 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1735, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
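// SwiGLU feed-forward: each path quantizes the normed activations (divide by its input scale, clamp to
// +/-240, cast to f8E4M3FNUZ), computes gate = silu(x @ W_gate^T) and up = x @ W_up^T in the 14336-wide
// space, multiplies them, re-quantizes, and projects back down to 4096 through the ffn_down weight.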
%1736 = torch.aten.div.Tensor %1735, %63 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1736, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1475 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1476 = torch.constant.float 2.400000e+02 | |
%1737 = torch.aten.clamp %1736, %float-2.400000e02_1475, %float2.400000e02_1476 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1737, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1477 = torch.constant.int 26 | |
%1738 = torch.prims.convert_element_type %1737, %int26_1477 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1738, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1478 = torch.constant.int -2 | |
%int-1_1479 = torch.constant.int -1 | |
%1739 = torch.aten.transpose.int %64, %int-2_1478, %int-1_1479 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_1480 = torch.constant.int 4096 | |
%1740 = torch.prim.ListConstruct %564, %int4096_1480 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1741 = torch.aten.view %1738, %1740 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1741, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1742 = torch.aten.mm %1741, %1739 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1742, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_1481 = torch.constant.int 1 | |
%int14336_1482 = torch.constant.int 14336 | |
%1743 = torch.prim.ListConstruct %int1_1481, %564, %int14336_1482 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1744 = torch.aten.view %1742, %1743 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1744, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_1483 = torch.constant.int 15 | |
%1745 = torch.prims.convert_element_type %1744, %int15_1483 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1745, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1746 = torch.aten.silu %1745 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1746, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1747 = torch.aten.div.Tensor %1735, %65 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1747, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1484 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1485 = torch.constant.float 2.400000e+02 | |
%1748 = torch.aten.clamp %1747, %float-2.400000e02_1484, %float2.400000e02_1485 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1748, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1486 = torch.constant.int 26 | |
%1749 = torch.prims.convert_element_type %1748, %int26_1486 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1749, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1487 = torch.constant.int -2 | |
%int-1_1488 = torch.constant.int -1 | |
%1750 = torch.aten.transpose.int %66, %int-2_1487, %int-1_1488 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_1489 = torch.constant.int 4096 | |
%1751 = torch.prim.ListConstruct %564, %int4096_1489 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1752 = torch.aten.view %1749, %1751 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1752, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1753 = torch.aten.mm %1752, %1750 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1753, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_1490 = torch.constant.int 1 | |
%int14336_1491 = torch.constant.int 14336 | |
%1754 = torch.prim.ListConstruct %int1_1490, %564, %int14336_1491 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1755 = torch.aten.view %1753, %1754 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1755, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_1492 = torch.constant.int 15 | |
%1756 = torch.prims.convert_element_type %1755, %int15_1492 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1756, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1757 = torch.aten.mul.Tensor %1746, %1756 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1757, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%1758 = torch.aten.div.Tensor %1757, %67 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1758, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_1493 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1494 = torch.constant.float 2.400000e+02 | |
%1759 = torch.aten.clamp %1758, %float-2.400000e02_1493, %float2.400000e02_1494 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %1759, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_1495 = torch.constant.int 26 | |
%1760 = torch.prims.convert_element_type %1759, %int26_1495 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1760, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_1496 = torch.constant.int -2 | |
%int-1_1497 = torch.constant.int -1 | |
%1761 = torch.aten.transpose.int %68, %int-2_1496, %int-1_1497 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_1498 = torch.constant.int 1 | |
%1762 = torch.aten.size.int %1744, %int1_1498 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_1499 = torch.constant.int 14336 | |
%1763 = torch.prim.ListConstruct %1762, %int14336_1499 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1764 = torch.aten.view %1760, %1763 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1764, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%1765 = torch.aten.mm %1764, %1761 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1765, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1500 = torch.constant.int 1 | |
%int4096_1501 = torch.constant.int 4096 | |
%1766 = torch.prim.ListConstruct %int1_1500, %1762, %int4096_1501 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1767 = torch.aten.view %1765, %1766 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1767, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1502 = torch.constant.int 15 | |
%1768 = torch.prims.convert_element_type %1767, %int15_1502 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1768, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_1503 = torch.constant.int 1 | |
%1769 = torch.aten.add.Tensor %1726, %1768, %int1_1503 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1769, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
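// The second residual add closes this transformer block; the next block begins here with its attention
// RMSNorm (weight %69) over %1769.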
%int6_1504 = torch.constant.int 6 | |
%1770 = torch.prims.convert_element_type %1769, %int6_1504 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1770, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_1505 = torch.constant.int 2 | |
%1771 = torch.aten.pow.Tensor_Scalar %1770, %int2_1505 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1771, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1506 = torch.constant.int -1 | |
%1772 = torch.prim.ListConstruct %int-1_1506 : (!torch.int) -> !torch.list<int> | |
%true_1507 = torch.constant.bool true | |
%none_1508 = torch.constant.none | |
%1773 = torch.aten.mean.dim %1771, %1772, %true_1507, %none_1508 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1773, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_1509 = torch.constant.float 1.000000e-05 | |
%int1_1510 = torch.constant.int 1 | |
%1774 = torch.aten.add.Scalar %1773, %float1.000000e-05_1509, %int1_1510 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1774, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1775 = torch.aten.rsqrt %1774 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %1775, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%1776 = torch.aten.mul.Tensor %1770, %1775 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %1776, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_1511 = torch.constant.int 15 | |
%1777 = torch.prims.convert_element_type %1776, %int15_1511 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1777, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1778 = torch.aten.mul.Tensor %69, %1777 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1778, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
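// Q/K/V projections for the new block: each path divides by its own input scale, clamps to +/-240, casts
// to f8E4M3FNUZ, matmuls against the transposed quantized weight (Q: 4096x4096, K and V: 1024x4096), and
// dequantizes the result back to bf16.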
%1779 = torch.aten.div.Tensor %1778, %70 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1779, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1512 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1513 = torch.constant.float 2.400000e+02 | |
%1780 = torch.aten.clamp %1779, %float-2.400000e02_1512, %float2.400000e02_1513 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1780, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1514 = torch.constant.int 26 | |
%1781 = torch.prims.convert_element_type %1780, %int26_1514 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1781, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1515 = torch.constant.int -2 | |
%int-1_1516 = torch.constant.int -1 | |
%1782 = torch.aten.transpose.int %71, %int-2_1515, %int-1_1516 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_1517 = torch.constant.int 4096 | |
%1783 = torch.prim.ListConstruct %564, %int4096_1517 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1784 = torch.aten.view %1781, %1783 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1784, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1785 = torch.aten.mm %1784, %1782 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1785, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1518 = torch.constant.int 1 | |
%int4096_1519 = torch.constant.int 4096 | |
%1786 = torch.prim.ListConstruct %int1_1518, %564, %int4096_1519 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1787 = torch.aten.view %1785, %1786 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1787, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1520 = torch.constant.int 15 | |
%1788 = torch.prims.convert_element_type %1787, %int15_1520 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1788, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%1789 = torch.aten.div.Tensor %1778, %72 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1789, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1521 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1522 = torch.constant.float 2.400000e+02 | |
%1790 = torch.aten.clamp %1789, %float-2.400000e02_1521, %float2.400000e02_1522 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1790, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1523 = torch.constant.int 26 | |
%1791 = torch.prims.convert_element_type %1790, %int26_1523 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1791, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1524 = torch.constant.int -2 | |
%int-1_1525 = torch.constant.int -1 | |
%1792 = torch.aten.transpose.int %73, %int-2_1524, %int-1_1525 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_1526 = torch.constant.int 4096 | |
%1793 = torch.prim.ListConstruct %564, %int4096_1526 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1794 = torch.aten.view %1791, %1793 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1794, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1795 = torch.aten.mm %1794, %1792 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1795, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_1527 = torch.constant.int 1 | |
%int1024_1528 = torch.constant.int 1024 | |
%1796 = torch.prim.ListConstruct %int1_1527, %564, %int1024_1528 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1797 = torch.aten.view %1795, %1796 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1797, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_1529 = torch.constant.int 15 | |
%1798 = torch.prims.convert_element_type %1797, %int15_1529 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %1798, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%1799 = torch.aten.div.Tensor %1778, %74 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1799, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1530 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1531 = torch.constant.float 2.400000e+02 | |
%1800 = torch.aten.clamp %1799, %float-2.400000e02_1530, %float2.400000e02_1531 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %1800, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1532 = torch.constant.int 26 | |
%1801 = torch.prims.convert_element_type %1800, %int26_1532 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1801, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1533 = torch.constant.int -2 | |
%int-1_1534 = torch.constant.int -1 | |
%1802 = torch.aten.transpose.int %75, %int-2_1533, %int-1_1534 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_1535 = torch.constant.int 4096 | |
%1803 = torch.prim.ListConstruct %564, %int4096_1535 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1804 = torch.aten.view %1801, %1803 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1804, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%1805 = torch.aten.mm %1804, %1802 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1805, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_1536 = torch.constant.int 1 | |
%int1024_1537 = torch.constant.int 1024 | |
%1806 = torch.prim.ListConstruct %int1_1536, %564, %int1024_1537 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1807 = torch.aten.view %1805, %1806 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1807, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_1538 = torch.constant.int 15 | |
%1808 = torch.prims.convert_element_type %1807, %int15_1538 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %1808, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
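// Reshape the projections into per-head layout: Q -> [1, seq, 32, 128], K and V -> [1, seq, 8, 128].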
%int1_1539 = torch.constant.int 1 | |
%int32_1540 = torch.constant.int 32 | |
%int128_1541 = torch.constant.int 128 | |
%1809 = torch.prim.ListConstruct %int1_1539, %564, %int32_1540, %int128_1541 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1810 = torch.aten.view %1788, %1809 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1810, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_1542 = torch.constant.int 1 | |
%int8_1543 = torch.constant.int 8 | |
%int128_1544 = torch.constant.int 128 | |
%1811 = torch.prim.ListConstruct %int1_1542, %564, %int8_1543, %int128_1544 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1812 = torch.aten.view %1798, %1811 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1812, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_1545 = torch.constant.int 1 | |
%int8_1546 = torch.constant.int 8 | |
%int128_1547 = torch.constant.int 128 | |
%1813 = torch.prim.ListConstruct %int1_1545, %564, %int8_1546, %int128_1547 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1814 = torch.aten.view %1808, %1813 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1814, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
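// Rotary-embedding tables: inverse frequencies with base 5.0e5 over a 131072-position range, with what
// looks like Llama-3-style frequency scaling (scale factor 8, smooth interpolation between wavelengths
// 2048 and 8192) applied before taking cos and sin and casting to bf16.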
%int131072_1548 = torch.constant.int 131072 | |
%none_1549 = torch.constant.none | |
%none_1550 = torch.constant.none | |
%cpu_1551 = torch.constant.device "cpu" | |
%false_1552 = torch.constant.bool false | |
%1815 = torch.aten.arange %int131072_1548, %none_1549, %none_1550, %cpu_1551, %false_1552 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1553 = torch.constant.int 0 | |
%int128_1554 = torch.constant.int 128 | |
%int2_1555 = torch.constant.int 2 | |
%int4_1556 = torch.constant.int 4 | |
%none_1557 = torch.constant.none | |
%cpu_1558 = torch.constant.device "cpu" | |
%false_1559 = torch.constant.bool false | |
%1816 = torch.aten.arange.start_step %int0_1553, %int128_1554, %int2_1555, %int4_1556, %none_1557, %cpu_1558, %false_1559 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1560 = torch.constant.int 6 | |
%1817 = torch.prims.convert_element_type %1816, %int6_1560 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1561 = torch.constant.int 128 | |
%1818 = torch.aten.div.Scalar %1817, %int128_1561 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1562 = torch.constant.float 5.000000e+05 | |
%1819 = torch.aten.pow.Scalar %float5.000000e05_1562, %1818 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1820 = torch.aten.reciprocal %1819 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1563 = torch.constant.float 1.000000e+00 | |
%1821 = torch.aten.mul.Scalar %1820, %float1.000000e00_1563 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1822 = torch.aten.reciprocal %1821 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1564 = torch.constant.float 6.2831853071795862 | |
%1823 = torch.aten.mul.Scalar %1822, %float6.283190e00_1564 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1565 = torch.constant.float 8.192000e+03 | |
%1824 = torch.aten.gt.Scalar %1823, %float8.192000e03_1565 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1566 = torch.constant.int 8 | |
%1825 = torch.aten.div.Scalar %1821, %int8_1566 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1826 = torch.aten.where.self %1824, %1825, %1821 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1827 = torch.aten.reciprocal %1823 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1567 = torch.constant.int 8192 | |
%1828 = torch.aten.mul.Scalar %1827, %int8192_1567 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1568 = torch.constant.int 1 | |
%int1_1569 = torch.constant.int 1 | |
%1829 = torch.aten.sub.Scalar %1828, %int1_1568, %int1_1569 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1570 = torch.constant.int 3 | |
%1830 = torch.aten.div.Scalar %1829, %int3_1570 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1571 = torch.constant.int 1 | |
%int1_1572 = torch.constant.int 1 | |
%1831 = torch.aten.rsub.Scalar %1830, %int1_1571, %int1_1572 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1832 = torch.aten.mul.Tensor %1831, %1826 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1573 = torch.constant.int 8 | |
%1833 = torch.aten.div.Scalar %1832, %int8_1573 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1834 = torch.aten.mul.Tensor %1830, %1826 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1574 = torch.constant.int 1 | |
%1835 = torch.aten.add.Tensor %1833, %1834, %int1_1574 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1575 = torch.constant.float 2.048000e+03 | |
%1836 = torch.aten.lt.Scalar %1823, %float2.048000e03_1575 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1837 = torch.aten.bitwise_not %1836 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1576 = torch.constant.float 8.192000e+03 | |
%1838 = torch.aten.gt.Scalar %1823, %float8.192000e03_1576 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1839 = torch.aten.bitwise_not %1838 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1840 = torch.aten.mul.Tensor %1837, %1839 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1841 = torch.aten.where.self %1840, %1835, %1826 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1842 = torch.prim.ListConstruct %1841, %1841 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1577 = torch.constant.int -1 | |
%1843 = torch.aten.cat %1842, %int-1_1577 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1578 = torch.constant.int 6 | |
%1844 = torch.prims.convert_element_type %1815, %int6_1578 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_1579 = torch.constant.int 131072 | |
%int1_1580 = torch.constant.int 1 | |
%1845 = torch.prim.ListConstruct %int131072_1579, %int1_1580 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1846 = torch.aten.view %1844, %1845 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1847 = torch.aten.mul.Tensor %1846, %1843 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1848 = torch.aten.cos %1847 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1581 = torch.constant.int 15 | |
%1849 = torch.prims.convert_element_type %1848, %int15_1581 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1850 = torch.aten.sin %1847 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1582 = torch.constant.int 15 | |
%1851 = torch.prims.convert_element_type %1850, %int15_1582 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
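// Slice the cos/sin tables to the current sequence length (%1852) and broadcast them to [1, seq, 1, 128].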
%int1_1583 = torch.constant.int 1 | |
%1852 = torch.aten.size.int %1787, %int1_1583 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_1584 = torch.constant.int 0 | |
%1853 = torch.aten.add.int %int0_1584, %1852 : !torch.int, !torch.int -> !torch.int | |
%int0_1585 = torch.constant.int 0 | |
%int0_1586 = torch.constant.int 0 | |
%int1_1587 = torch.constant.int 1 | |
%1854 = torch.aten.slice.Tensor %1849, %int0_1585, %int0_1586, %1853, %int1_1587 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1854, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1588 = torch.constant.int 1 | |
%int0_1589 = torch.constant.int 0 | |
%int9223372036854775807_1590 = torch.constant.int 9223372036854775807 | |
%int1_1591 = torch.constant.int 1 | |
%1855 = torch.aten.slice.Tensor %1854, %int1_1588, %int0_1589, %int9223372036854775807_1590, %int1_1591 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1855, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1592 = torch.constant.int 0 | |
%1856 = torch.aten.add.int %int0_1592, %1852 : !torch.int, !torch.int -> !torch.int | |
%int0_1593 = torch.constant.int 0 | |
%int0_1594 = torch.constant.int 0 | |
%int1_1595 = torch.constant.int 1 | |
%1857 = torch.aten.slice.Tensor %1851, %int0_1593, %int0_1594, %1856, %int1_1595 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1857, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1596 = torch.constant.int 1 | |
%int0_1597 = torch.constant.int 0 | |
%int9223372036854775807_1598 = torch.constant.int 9223372036854775807 | |
%int1_1599 = torch.constant.int 1 | |
%1858 = torch.aten.slice.Tensor %1857, %int1_1596, %int0_1597, %int9223372036854775807_1598, %int1_1599 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1858, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1600 = torch.constant.int 0 | |
%1859 = torch.aten.unsqueeze %1855, %int0_1600 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1859, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1601 = torch.constant.int 1 | |
%int0_1602 = torch.constant.int 0 | |
%int9223372036854775807_1603 = torch.constant.int 9223372036854775807 | |
%int1_1604 = torch.constant.int 1 | |
%1860 = torch.aten.slice.Tensor %1859, %int1_1601, %int0_1602, %int9223372036854775807_1603, %int1_1604 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1860, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1605 = torch.constant.int 2 | |
%1861 = torch.aten.unsqueeze %1860, %int2_1605 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1861, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1606 = torch.constant.int 3 | |
%int0_1607 = torch.constant.int 0 | |
%int9223372036854775807_1608 = torch.constant.int 9223372036854775807 | |
%int1_1609 = torch.constant.int 1 | |
%1862 = torch.aten.slice.Tensor %1861, %int3_1606, %int0_1607, %int9223372036854775807_1608, %int1_1609 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1862, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_1610 = torch.constant.int 0 | |
%1863 = torch.aten.unsqueeze %1858, %int0_1610 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1863, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1611 = torch.constant.int 1 | |
%int0_1612 = torch.constant.int 0 | |
%int9223372036854775807_1613 = torch.constant.int 9223372036854775807 | |
%int1_1614 = torch.constant.int 1 | |
%1864 = torch.aten.slice.Tensor %1863, %int1_1611, %int0_1612, %int9223372036854775807_1613, %int1_1614 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1864, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1615 = torch.constant.int 2 | |
%1865 = torch.aten.unsqueeze %1864, %int2_1615 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1865, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1616 = torch.constant.int 3 | |
%int0_1617 = torch.constant.int 0 | |
%int9223372036854775807_1618 = torch.constant.int 9223372036854775807 | |
%int1_1619 = torch.constant.int 1 | |
%1866 = torch.aten.slice.Tensor %1865, %int3_1616, %int0_1617, %int9223372036854775807_1618, %int1_1619 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1866, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
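// Apply RoPE to Q using the rotate-half form: q * cos + rotate_half(q) * sin, where rotate_half
// concatenates (-q[..., 64:], q[..., :64]).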
%int1_1620 = torch.constant.int 1 | |
%int2_1621 = torch.constant.int 2 | |
%1867 = torch.aten.transpose.int %1862, %int1_1620, %int2_1621 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1867, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1622 = torch.constant.int 1 | |
%int1_1623 = torch.constant.int 1 | |
%int1_1624 = torch.constant.int 1 | |
%int1_1625 = torch.constant.int 1 | |
%1868 = torch.prim.ListConstruct %int1_1622, %int1_1623, %int1_1624, %int1_1625 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1869 = torch.aten.repeat %1867, %1868 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1869, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1626 = torch.constant.int 1 | |
%int2_1627 = torch.constant.int 2 | |
%1870 = torch.aten.transpose.int %1866, %int1_1626, %int2_1627 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1870, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1628 = torch.constant.int 1 | |
%int2_1629 = torch.constant.int 2 | |
%1871 = torch.aten.transpose.int %1810, %int1_1628, %int2_1629 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1871, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1630 = torch.constant.int 1 | |
%int1_1631 = torch.constant.int 1 | |
%int1_1632 = torch.constant.int 1 | |
%int1_1633 = torch.constant.int 1 | |
%1872 = torch.prim.ListConstruct %int1_1630, %int1_1631, %int1_1632, %int1_1633 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1873 = torch.aten.repeat %1870, %1872 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1873, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1874 = torch.aten.mul.Tensor %1871, %1869 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1874, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_1634 = torch.constant.int 3 | |
%int0_1635 = torch.constant.int 0 | |
%int64_1636 = torch.constant.int 64 | |
%int1_1637 = torch.constant.int 1 | |
%1875 = torch.aten.slice.Tensor %1871, %int3_1634, %int0_1635, %int64_1636, %int1_1637 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1875, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_1638 = torch.constant.int 3 | |
%int64_1639 = torch.constant.int 64 | |
%int9223372036854775807_1640 = torch.constant.int 9223372036854775807 | |
%int1_1641 = torch.constant.int 1 | |
%1876 = torch.aten.slice.Tensor %1871, %int3_1638, %int64_1639, %int9223372036854775807_1640, %int1_1641 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1876, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%1877 = torch.aten.neg %1876 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %1877, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%1878 = torch.prim.ListConstruct %1877, %1875 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_1642 = torch.constant.int -1 | |
%1879 = torch.aten.cat %1878, %int-1_1642 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1879, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%1880 = torch.aten.mul.Tensor %1879, %1873 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1880, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1643 = torch.constant.int 1 | |
%1881 = torch.aten.add.Tensor %1874, %1880, %int1_1643 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %1881, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1644 = torch.constant.int 1 | |
%int2_1645 = torch.constant.int 2 | |
%1882 = torch.aten.transpose.int %1881, %int1_1644, %int2_1645 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %1882, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
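// Below, the RoPE inverse-frequency and cos/sin tables are rebuilt for the key path. The constants suggest Llama-3-style frequency scaling: theta = 5.0e5, scale factor 8, and wavelength thresholds 2048 / 8192 for the smoothed interpolation.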
%int131072_1646 = torch.constant.int 131072 | |
%none_1647 = torch.constant.none | |
%none_1648 = torch.constant.none | |
%cpu_1649 = torch.constant.device "cpu" | |
%false_1650 = torch.constant.bool false | |
%1883 = torch.aten.arange %int131072_1646, %none_1647, %none_1648, %cpu_1649, %false_1650 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1651 = torch.constant.int 0 | |
%int128_1652 = torch.constant.int 128 | |
%int2_1653 = torch.constant.int 2 | |
%int4_1654 = torch.constant.int 4 | |
%none_1655 = torch.constant.none | |
%cpu_1656 = torch.constant.device "cpu" | |
%false_1657 = torch.constant.bool false | |
%1884 = torch.aten.arange.start_step %int0_1651, %int128_1652, %int2_1653, %int4_1654, %none_1655, %cpu_1656, %false_1657 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1658 = torch.constant.int 6 | |
%1885 = torch.prims.convert_element_type %1884, %int6_1658 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1659 = torch.constant.int 128 | |
%1886 = torch.aten.div.Scalar %1885, %int128_1659 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1660 = torch.constant.float 5.000000e+05 | |
%1887 = torch.aten.pow.Scalar %float5.000000e05_1660, %1886 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1888 = torch.aten.reciprocal %1887 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1661 = torch.constant.float 1.000000e+00 | |
%1889 = torch.aten.mul.Scalar %1888, %float1.000000e00_1661 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1890 = torch.aten.reciprocal %1889 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1662 = torch.constant.float 6.2831853071795862 | |
%1891 = torch.aten.mul.Scalar %1890, %float6.283190e00_1662 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1663 = torch.constant.float 8.192000e+03 | |
%1892 = torch.aten.gt.Scalar %1891, %float8.192000e03_1663 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1664 = torch.constant.int 8 | |
%1893 = torch.aten.div.Scalar %1889, %int8_1664 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1894 = torch.aten.where.self %1892, %1893, %1889 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1895 = torch.aten.reciprocal %1891 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1665 = torch.constant.int 8192 | |
%1896 = torch.aten.mul.Scalar %1895, %int8192_1665 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1666 = torch.constant.int 1 | |
%int1_1667 = torch.constant.int 1 | |
%1897 = torch.aten.sub.Scalar %1896, %int1_1666, %int1_1667 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1668 = torch.constant.int 3 | |
%1898 = torch.aten.div.Scalar %1897, %int3_1668 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1669 = torch.constant.int 1 | |
%int1_1670 = torch.constant.int 1 | |
%1899 = torch.aten.rsub.Scalar %1898, %int1_1669, %int1_1670 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1900 = torch.aten.mul.Tensor %1899, %1894 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1671 = torch.constant.int 8 | |
%1901 = torch.aten.div.Scalar %1900, %int8_1671 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1902 = torch.aten.mul.Tensor %1898, %1894 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1672 = torch.constant.int 1 | |
%1903 = torch.aten.add.Tensor %1901, %1902, %int1_1672 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1673 = torch.constant.float 2.048000e+03 | |
%1904 = torch.aten.lt.Scalar %1891, %float2.048000e03_1673 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1905 = torch.aten.bitwise_not %1904 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1674 = torch.constant.float 8.192000e+03 | |
%1906 = torch.aten.gt.Scalar %1891, %float8.192000e03_1674 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1907 = torch.aten.bitwise_not %1906 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1908 = torch.aten.mul.Tensor %1905, %1907 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1909 = torch.aten.where.self %1908, %1903, %1894 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1910 = torch.prim.ListConstruct %1909, %1909 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1675 = torch.constant.int -1 | |
%1911 = torch.aten.cat %1910, %int-1_1675 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1676 = torch.constant.int 6 | |
%1912 = torch.prims.convert_element_type %1883, %int6_1676 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_1677 = torch.constant.int 131072 | |
%int1_1678 = torch.constant.int 1 | |
%1913 = torch.prim.ListConstruct %int131072_1677, %int1_1678 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1914 = torch.aten.view %1912, %1913 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%1915 = torch.aten.mul.Tensor %1914, %1911 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1916 = torch.aten.cos %1915 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1679 = torch.constant.int 15 | |
%1917 = torch.prims.convert_element_type %1916, %int15_1679 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1918 = torch.aten.sin %1915 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1680 = torch.constant.int 15 | |
%1919 = torch.prims.convert_element_type %1918, %int15_1680 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
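// The [131072,128] bf16 cos/sin tables are then sliced to the current sequence length and reshaped to [1, seq, 1, 128] so they broadcast over the key heads.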
%int1_1681 = torch.constant.int 1 | |
%1920 = torch.aten.size.int %1797, %int1_1681 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_1682 = torch.constant.int 0 | |
%1921 = torch.aten.add.int %int0_1682, %1920 : !torch.int, !torch.int -> !torch.int | |
%int0_1683 = torch.constant.int 0 | |
%int0_1684 = torch.constant.int 0 | |
%int1_1685 = torch.constant.int 1 | |
%1922 = torch.aten.slice.Tensor %1917, %int0_1683, %int0_1684, %1921, %int1_1685 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1922, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1686 = torch.constant.int 1 | |
%int0_1687 = torch.constant.int 0 | |
%int9223372036854775807_1688 = torch.constant.int 9223372036854775807 | |
%int1_1689 = torch.constant.int 1 | |
%1923 = torch.aten.slice.Tensor %1922, %int1_1686, %int0_1687, %int9223372036854775807_1688, %int1_1689 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1923, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1690 = torch.constant.int 0 | |
%1924 = torch.aten.add.int %int0_1690, %1920 : !torch.int, !torch.int -> !torch.int | |
%int0_1691 = torch.constant.int 0 | |
%int0_1692 = torch.constant.int 0 | |
%int1_1693 = torch.constant.int 1 | |
%1925 = torch.aten.slice.Tensor %1919, %int0_1691, %int0_1692, %1924, %int1_1693 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1925, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1694 = torch.constant.int 1 | |
%int0_1695 = torch.constant.int 0 | |
%int9223372036854775807_1696 = torch.constant.int 9223372036854775807 | |
%int1_1697 = torch.constant.int 1 | |
%1926 = torch.aten.slice.Tensor %1925, %int1_1694, %int0_1695, %int9223372036854775807_1696, %int1_1697 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1926, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1698 = torch.constant.int 0 | |
%1927 = torch.aten.unsqueeze %1923, %int0_1698 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1927, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1699 = torch.constant.int 1 | |
%int0_1700 = torch.constant.int 0 | |
%int9223372036854775807_1701 = torch.constant.int 9223372036854775807 | |
%int1_1702 = torch.constant.int 1 | |
%1928 = torch.aten.slice.Tensor %1927, %int1_1699, %int0_1700, %int9223372036854775807_1701, %int1_1702 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1928, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1703 = torch.constant.int 2 | |
%1929 = torch.aten.unsqueeze %1928, %int2_1703 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1929, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1704 = torch.constant.int 3 | |
%int0_1705 = torch.constant.int 0 | |
%int9223372036854775807_1706 = torch.constant.int 9223372036854775807 | |
%int1_1707 = torch.constant.int 1 | |
%1930 = torch.aten.slice.Tensor %1929, %int3_1704, %int0_1705, %int9223372036854775807_1706, %int1_1707 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1930, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_1708 = torch.constant.int 0 | |
%1931 = torch.aten.unsqueeze %1926, %int0_1708 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1931, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1709 = torch.constant.int 1 | |
%int0_1710 = torch.constant.int 0 | |
%int9223372036854775807_1711 = torch.constant.int 9223372036854775807 | |
%int1_1712 = torch.constant.int 1 | |
%1932 = torch.aten.slice.Tensor %1931, %int1_1709, %int0_1710, %int9223372036854775807_1711, %int1_1712 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1932, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1713 = torch.constant.int 2 | |
%1933 = torch.aten.unsqueeze %1932, %int2_1713 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1933, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1714 = torch.constant.int 3 | |
%int0_1715 = torch.constant.int 0 | |
%int9223372036854775807_1716 = torch.constant.int 9223372036854775807 | |
%int1_1717 = torch.constant.int 1 | |
%1934 = torch.aten.slice.Tensor %1933, %int3_1714, %int0_1715, %int9223372036854775807_1716, %int1_1717 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1934, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_1718 = torch.constant.int 1 | |
%int2_1719 = torch.constant.int 2 | |
%1935 = torch.aten.transpose.int %1930, %int1_1718, %int2_1719 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1935, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1720 = torch.constant.int 1 | |
%int1_1721 = torch.constant.int 1 | |
%int1_1722 = torch.constant.int 1 | |
%int1_1723 = torch.constant.int 1 | |
%1936 = torch.prim.ListConstruct %int1_1720, %int1_1721, %int1_1722, %int1_1723 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1937 = torch.aten.repeat %1935, %1936 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1937, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_1724 = torch.constant.int 1 | |
%int2_1725 = torch.constant.int 2 | |
%1938 = torch.aten.transpose.int %1934, %int1_1724, %int2_1725 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1938, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
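// The same rotate-half RoPE formulation is now applied to the 8-head (grouped-query) key tensor.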
%int1_1726 = torch.constant.int 1 | |
%int2_1727 = torch.constant.int 2 | |
%1939 = torch.aten.transpose.int %1812, %int1_1726, %int2_1727 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1939, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_1728 = torch.constant.int 1 | |
%int1_1729 = torch.constant.int 1 | |
%int1_1730 = torch.constant.int 1 | |
%int1_1731 = torch.constant.int 1 | |
%1940 = torch.prim.ListConstruct %int1_1728, %int1_1729, %int1_1730, %int1_1731 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1941 = torch.aten.repeat %1938, %1940 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %1941, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%1942 = torch.aten.mul.Tensor %1939, %1937 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1942, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_1732 = torch.constant.int 3 | |
%int0_1733 = torch.constant.int 0 | |
%int64_1734 = torch.constant.int 64 | |
%int1_1735 = torch.constant.int 1 | |
%1943 = torch.aten.slice.Tensor %1939, %int3_1732, %int0_1733, %int64_1734, %int1_1735 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1943, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_1736 = torch.constant.int 3 | |
%int64_1737 = torch.constant.int 64 | |
%int9223372036854775807_1738 = torch.constant.int 9223372036854775807 | |
%int1_1739 = torch.constant.int 1 | |
%1944 = torch.aten.slice.Tensor %1939, %int3_1736, %int64_1737, %int9223372036854775807_1738, %int1_1739 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1944, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1945 = torch.aten.neg %1944 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %1945, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%1946 = torch.prim.ListConstruct %1945, %1943 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_1740 = torch.constant.int -1 | |
%1947 = torch.aten.cat %1946, %int-1_1740 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1947, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%1948 = torch.aten.mul.Tensor %1947, %1941 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1948, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_1741 = torch.constant.int 1 | |
%1949 = torch.aten.add.Tensor %1942, %1948, %int1_1741 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %1949, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_1742 = torch.constant.int 1 | |
%int2_1743 = torch.constant.int 2 | |
%1950 = torch.aten.transpose.int %1949, %int1_1742, %int2_1743 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1950, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
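// K and V appear to be fake-quantized for the KV cache: divided by a quantizer scale (%76, presumably this block's kv_cache rscale), clamped to +/-240 (the f8E4M3FNUZ finite range), and cast to f8E4M3FNUZ.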
%1951 = torch.aten.div.Tensor %1950, %76 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1951, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_1744 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1745 = torch.constant.float 2.400000e+02 | |
%1952 = torch.aten.clamp %1951, %float-2.400000e02_1744, %float2.400000e02_1745 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1952, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_1746 = torch.constant.int 26 | |
%1953 = torch.prims.convert_element_type %1952, %int26_1746 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1953, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%1954 = torch.aten.div.Tensor %1814, %76 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1954, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_1747 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1748 = torch.constant.float 2.400000e+02 | |
%1955 = torch.aten.clamp %1954, %float-2.400000e02_1747, %float2.400000e02_1748 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %1955, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_1749 = torch.constant.int 26 | |
%1956 = torch.prims.convert_element_type %1955, %int26_1749 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1956, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
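// Cache slot indices are computed as %arg2 * 64 + 8; with each cache page viewed as [32, 2, 32, 8, 128] and the leading dims flattened, the +8 offset likely selects this block's K slot (two slots per transformer block, K then V). The quantized K tile is then scattered into the paged cache with index_put.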
%int64_1750 = torch.constant.int 64 | |
%1957 = torch.aten.mul.Scalar %arg2, %int64_1750 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1957, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int8_1751 = torch.constant.int 8 | |
%int1_1752 = torch.constant.int 1 | |
%1958 = torch.aten.add.Scalar %1957, %int8_1751, %int1_1752 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1958, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int1_1753 = torch.constant.int 1 | |
%int32_1754 = torch.constant.int 32 | |
%int8_1755 = torch.constant.int 8 | |
%int128_1756 = torch.constant.int 128 | |
%1959 = torch.prim.ListConstruct %int1_1753, %748, %int32_1754, %int8_1755, %int128_1756 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1960 = torch.aten.view %1953, %1959 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1960, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_1757 = torch.constant.int 32 | |
%int8_1758 = torch.constant.int 8 | |
%int128_1759 = torch.constant.int 128 | |
%1961 = torch.prim.ListConstruct %748, %int32_1757, %int8_1758, %int128_1759 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1962 = torch.aten.view %1960, %1961 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1962, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1963 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1964 = torch.aten.view %1958, %1963 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1964, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%int32_1760 = torch.constant.int 32 | |
%int2_1761 = torch.constant.int 2 | |
%int32_1762 = torch.constant.int 32 | |
%int8_1763 = torch.constant.int 8 | |
%int128_1764 = torch.constant.int 128 | |
%1965 = torch.prim.ListConstruct %739, %int32_1760, %int2_1761, %int32_1762, %int8_1763, %int128_1764 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1966 = torch.aten.view %1689, %1965 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1966, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1765 = torch.constant.int 32 | |
%1967 = torch.aten.mul.int %739, %int32_1765 : !torch.int, !torch.int -> !torch.int | |
%int2_1766 = torch.constant.int 2 | |
%1968 = torch.aten.mul.int %1967, %int2_1766 : !torch.int, !torch.int -> !torch.int | |
%int32_1767 = torch.constant.int 32 | |
%int8_1768 = torch.constant.int 8 | |
%int128_1769 = torch.constant.int 128 | |
%1969 = torch.prim.ListConstruct %1968, %int32_1767, %int8_1768, %int128_1769 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1970 = torch.aten.view %1966, %1969 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1970, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1971 = torch.prim.ListConstruct %1964 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1770 = torch.constant.bool false | |
%1972 = torch.aten.index_put %1970, %1971, %1962, %false_1770 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1972, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1771 = torch.constant.int 32 | |
%int2_1772 = torch.constant.int 2 | |
%int32_1773 = torch.constant.int 32 | |
%int8_1774 = torch.constant.int 8 | |
%int128_1775 = torch.constant.int 128 | |
%1973 = torch.prim.ListConstruct %739, %int32_1771, %int2_1772, %int32_1773, %int8_1774, %int128_1775 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1974 = torch.aten.view %1972, %1973 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1974, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1776 = torch.constant.int 2097152 | |
%1975 = torch.prim.ListConstruct %739, %int2097152_1776 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1976 = torch.aten.view %1974, %1975 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1976, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
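// The same index_put pattern writes the quantized V tile, using the same indices offset by one additional slot (+1).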
%int32_1777 = torch.constant.int 32 | |
%int2_1778 = torch.constant.int 2 | |
%int32_1779 = torch.constant.int 32 | |
%int8_1780 = torch.constant.int 8 | |
%int128_1781 = torch.constant.int 128 | |
%1977 = torch.prim.ListConstruct %739, %int32_1777, %int2_1778, %int32_1779, %int8_1780, %int128_1781 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1978 = torch.aten.view %1976, %1977 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1978, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1782 = torch.constant.int 32 | |
%int8_1783 = torch.constant.int 8 | |
%int128_1784 = torch.constant.int 128 | |
%1979 = torch.prim.ListConstruct %1968, %int32_1782, %int8_1783, %int128_1784 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1980 = torch.aten.view %1978, %1979 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1980, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1785 = torch.constant.int 1 | |
%int32_1786 = torch.constant.int 32 | |
%int8_1787 = torch.constant.int 8 | |
%int128_1788 = torch.constant.int 128 | |
%1981 = torch.prim.ListConstruct %int1_1785, %748, %int32_1786, %int8_1787, %int128_1788 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1982 = torch.aten.view %1956, %1981 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1982, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_1789 = torch.constant.int 32 | |
%int8_1790 = torch.constant.int 8 | |
%int128_1791 = torch.constant.int 128 | |
%1983 = torch.prim.ListConstruct %748, %int32_1789, %int8_1790, %int128_1791 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1984 = torch.aten.view %1982, %1983 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1984, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1792 = torch.constant.int 1 | |
%int1_1793 = torch.constant.int 1 | |
%1985 = torch.aten.add.Scalar %1958, %int1_1792, %int1_1793 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %1985, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%1986 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%1987 = torch.aten.view %1985, %1986 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1987, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%1988 = torch.prim.ListConstruct %1987 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1794 = torch.constant.bool false | |
%1989 = torch.aten.index_put %1980, %1988, %1984, %false_1794 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1989, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1795 = torch.constant.int 32 | |
%int2_1796 = torch.constant.int 2 | |
%int32_1797 = torch.constant.int 32 | |
%int8_1798 = torch.constant.int 8 | |
%int128_1799 = torch.constant.int 128 | |
%1990 = torch.prim.ListConstruct %739, %int32_1795, %int2_1796, %int32_1797, %int8_1798, %int128_1799 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1991 = torch.aten.view %1989, %1990 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1991, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1800 = torch.constant.int 2097152 | |
%1992 = torch.prim.ListConstruct %739, %int2097152_1800 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1993 = torch.aten.view %1991, %1992 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1993, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
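// Grouped-query attention expansion: the 8 KV heads are unsqueezed, expanded by a factor of 4, and reshaped to 32 heads so they line up with the query heads.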
%int-2_1801 = torch.constant.int -2 | |
%1994 = torch.aten.unsqueeze %1953, %int-2_1801 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1994, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_1802 = torch.constant.int 1 | |
%int8_1803 = torch.constant.int 8 | |
%int4_1804 = torch.constant.int 4 | |
%int128_1805 = torch.constant.int 128 | |
%1995 = torch.prim.ListConstruct %int1_1802, %1920, %int8_1803, %int4_1804, %int128_1805 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1806 = torch.constant.bool false | |
%1996 = torch.aten.expand %1994, %1995, %false_1806 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1996, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_1807 = torch.constant.int 0 | |
%1997 = torch.aten.clone %1996, %int0_1807 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1997, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_1808 = torch.constant.int 1 | |
%int32_1809 = torch.constant.int 32 | |
%int128_1810 = torch.constant.int 128 | |
%1998 = torch.prim.ListConstruct %int1_1808, %1920, %int32_1809, %int128_1810 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1999 = torch.aten._unsafe_view %1997, %1998 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1999, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_1811 = torch.constant.int -2 | |
%2000 = torch.aten.unsqueeze %1956, %int-2_1811 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2000, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_1812 = torch.constant.int 1 | |
%2001 = torch.aten.size.int %1807, %int1_1812 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_1813 = torch.constant.int 1 | |
%int8_1814 = torch.constant.int 8 | |
%int4_1815 = torch.constant.int 4 | |
%int128_1816 = torch.constant.int 128 | |
%2002 = torch.prim.ListConstruct %int1_1813, %2001, %int8_1814, %int4_1815, %int128_1816 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1817 = torch.constant.bool false | |
%2003 = torch.aten.expand %2000, %2002, %false_1817 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2003, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_1818 = torch.constant.int 0 | |
%2004 = torch.aten.clone %2003, %int0_1818 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2004, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_1819 = torch.constant.int 1 | |
%int32_1820 = torch.constant.int 32 | |
%int128_1821 = torch.constant.int 128 | |
%2005 = torch.prim.ListConstruct %int1_1819, %2001, %int32_1820, %int128_1821 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2006 = torch.aten._unsafe_view %2004, %2005 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2006, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
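// K and V are then dequantized for attention: cast to f32, rescaled by the same quantizer scale (%76), and converted to bf16.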
%int6_1822 = torch.constant.int 6 | |
%2007 = torch.prims.convert_element_type %1999, %int6_1822 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2007, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2008 = torch.aten.mul.Tensor %2007, %76 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2008, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_1823 = torch.constant.int 15 | |
%2009 = torch.prims.convert_element_type %2008, %int15_1823 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2009, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_1824 = torch.constant.int 6 | |
%2010 = torch.prims.convert_element_type %2006, %int6_1824 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2010, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2011 = torch.aten.mul.Tensor %2010, %76 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2011, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_1825 = torch.constant.int 15 | |
%2012 = torch.prims.convert_element_type %2011, %int15_1825 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2012, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_1826 = torch.constant.int 1 | |
%int2_1827 = torch.constant.int 2 | |
%2013 = torch.aten.transpose.int %1882, %int1_1826, %int2_1827 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2013, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1828 = torch.constant.int 1 | |
%int2_1829 = torch.constant.int 2 | |
%2014 = torch.aten.transpose.int %2009, %int1_1828, %int2_1829 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2014, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1830 = torch.constant.int 1 | |
%int2_1831 = torch.constant.int 2 | |
%2015 = torch.aten.transpose.int %2012, %int1_1830, %int2_1831 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2015, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
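// Causal scaled-dot-product (flash) attention over the bf16 Q/K/V; the boolean operand true appears to be is_causal, with dropout 0.0 and no explicit mask or scale.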
%float0.000000e00_1832 = torch.constant.float 0.000000e+00 | |
%true_1833 = torch.constant.bool true | |
%none_1834 = torch.constant.none | |
%none_1835 = torch.constant.none | |
%2016:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2013, %2014, %2015, %float0.000000e00_1832, %true_1833, %none_1834, %none_1835) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %2016#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_1836 = torch.constant.int 1 | |
%int2_1837 = torch.constant.int 2 | |
%2017 = torch.aten.transpose.int %2016#0, %int1_1836, %int2_1837 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2017, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_1838 = torch.constant.int 1 | |
%int4096_1839 = torch.constant.int 4096 | |
%2018 = torch.prim.ListConstruct %int1_1838, %1852, %int4096_1839 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2019 = torch.aten.view %2017, %2018 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2019, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
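// Attention-output projection: the [1, seq, 4096] result is fake-quantized to f8E4M3FNUZ (divide by %77, clamp to +/-240), multiplied by the transposed attn_output weight (%78), converted back to bf16, and added to the residual stream (%1769).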
%2020 = torch.aten.div.Tensor %2019, %77 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2020, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1840 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1841 = torch.constant.float 2.400000e+02 | |
%2021 = torch.aten.clamp %2020, %float-2.400000e02_1840, %float2.400000e02_1841 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2021, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1842 = torch.constant.int 26 | |
%2022 = torch.prims.convert_element_type %2021, %int26_1842 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2022, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1843 = torch.constant.int -2 | |
%int-1_1844 = torch.constant.int -1 | |
%2023 = torch.aten.transpose.int %78, %int-2_1843, %int-1_1844 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_1845 = torch.constant.int 4096 | |
%2024 = torch.prim.ListConstruct %1852, %int4096_1845 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2025 = torch.aten.view %2022, %2024 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2025, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2026 = torch.aten.mm %2025, %2023 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2026, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1846 = torch.constant.int 1 | |
%int4096_1847 = torch.constant.int 4096 | |
%2027 = torch.prim.ListConstruct %int1_1846, %1852, %int4096_1847 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2028 = torch.aten.view %2026, %2027 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2028, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1848 = torch.constant.int 15 | |
%2029 = torch.prims.convert_element_type %2028, %int15_1848 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2029, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_1849 = torch.constant.int 1 | |
%2030 = torch.aten.add.Tensor %1769, %2029, %int1_1849 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2030, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
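// RMSNorm with eps 1e-5, presumably this block's ffn_norm: mean of squares over the hidden dim, rsqrt, then scale by the norm weight (%79).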
%int6_1850 = torch.constant.int 6 | |
%2031 = torch.prims.convert_element_type %2030, %int6_1850 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2031, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_1851 = torch.constant.int 2 | |
%2032 = torch.aten.pow.Tensor_Scalar %2031, %int2_1851 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2032, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1852 = torch.constant.int -1 | |
%2033 = torch.prim.ListConstruct %int-1_1852 : (!torch.int) -> !torch.list<int> | |
%true_1853 = torch.constant.bool true | |
%none_1854 = torch.constant.none | |
%2034 = torch.aten.mean.dim %2032, %2033, %true_1853, %none_1854 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2034, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_1855 = torch.constant.float 1.000000e-05 | |
%int1_1856 = torch.constant.int 1 | |
%2035 = torch.aten.add.Scalar %2034, %float1.000000e-05_1855, %int1_1856 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2035, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2036 = torch.aten.rsqrt %2035 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2036, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2037 = torch.aten.mul.Tensor %2031, %2036 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2037, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_1857 = torch.constant.int 15 | |
%2038 = torch.prims.convert_element_type %2037, %int15_1857 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2038, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2039 = torch.aten.mul.Tensor %79, %2038 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2039, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
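// SwiGLU feed-forward: the gate (%81) and up (%83) projections each go through the same fake-quant -> f8 matmul -> bf16 path, silu(gate) is multiplied by up, and the down projection (%85) maps 14336 back to 4096 before the residual add.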
%2040 = torch.aten.div.Tensor %2039, %80 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2040, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1858 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1859 = torch.constant.float 2.400000e+02 | |
%2041 = torch.aten.clamp %2040, %float-2.400000e02_1858, %float2.400000e02_1859 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2041, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1860 = torch.constant.int 26 | |
%2042 = torch.prims.convert_element_type %2041, %int26_1860 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2042, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1861 = torch.constant.int -2 | |
%int-1_1862 = torch.constant.int -1 | |
%2043 = torch.aten.transpose.int %81, %int-2_1861, %int-1_1862 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_1863 = torch.constant.int 4096 | |
%2044 = torch.prim.ListConstruct %564, %int4096_1863 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2045 = torch.aten.view %2042, %2044 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2045, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2046 = torch.aten.mm %2045, %2043 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2046, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_1864 = torch.constant.int 1 | |
%int14336_1865 = torch.constant.int 14336 | |
%2047 = torch.prim.ListConstruct %int1_1864, %564, %int14336_1865 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2048 = torch.aten.view %2046, %2047 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2048, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_1866 = torch.constant.int 15 | |
%2049 = torch.prims.convert_element_type %2048, %int15_1866 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2049, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2050 = torch.aten.silu %2049 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2050, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2051 = torch.aten.div.Tensor %2039, %82 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2051, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1867 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1868 = torch.constant.float 2.400000e+02 | |
%2052 = torch.aten.clamp %2051, %float-2.400000e02_1867, %float2.400000e02_1868 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2052, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1869 = torch.constant.int 26 | |
%2053 = torch.prims.convert_element_type %2052, %int26_1869 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2053, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1870 = torch.constant.int -2 | |
%int-1_1871 = torch.constant.int -1 | |
%2054 = torch.aten.transpose.int %83, %int-2_1870, %int-1_1871 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_1872 = torch.constant.int 4096 | |
%2055 = torch.prim.ListConstruct %564, %int4096_1872 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2056 = torch.aten.view %2053, %2055 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2056, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2057 = torch.aten.mm %2056, %2054 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2057, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_1873 = torch.constant.int 1 | |
%int14336_1874 = torch.constant.int 14336 | |
%2058 = torch.prim.ListConstruct %int1_1873, %564, %int14336_1874 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2059 = torch.aten.view %2057, %2058 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2059, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_1875 = torch.constant.int 15 | |
%2060 = torch.prims.convert_element_type %2059, %int15_1875 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2060, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2061 = torch.aten.mul.Tensor %2050, %2060 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2061, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2062 = torch.aten.div.Tensor %2061, %84 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2062, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_1876 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1877 = torch.constant.float 2.400000e+02 | |
%2063 = torch.aten.clamp %2062, %float-2.400000e02_1876, %float2.400000e02_1877 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2063, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_1878 = torch.constant.int 26 | |
%2064 = torch.prims.convert_element_type %2063, %int26_1878 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2064, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_1879 = torch.constant.int -2 | |
%int-1_1880 = torch.constant.int -1 | |
%2065 = torch.aten.transpose.int %85, %int-2_1879, %int-1_1880 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_1881 = torch.constant.int 1 | |
%2066 = torch.aten.size.int %2048, %int1_1881 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_1882 = torch.constant.int 14336 | |
%2067 = torch.prim.ListConstruct %2066, %int14336_1882 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2068 = torch.aten.view %2064, %2067 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2068, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%2069 = torch.aten.mm %2068, %2065 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2069, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1883 = torch.constant.int 1 | |
%int4096_1884 = torch.constant.int 4096 | |
%2070 = torch.prim.ListConstruct %int1_1883, %2066, %int4096_1884 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2071 = torch.aten.view %2069, %2070 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2071, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1885 = torch.constant.int 15 | |
%2072 = torch.prims.convert_element_type %2071, %int15_1885 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2072, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_1886 = torch.constant.int 1 | |
%2073 = torch.aten.add.Tensor %2030, %2072, %int1_1886 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2073, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
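// The next transformer block appears to start here: attention RMSNorm (weight %86), followed by the Q projection (%88, 4096x4096) and the K projection (%90, 1024x4096, i.e. 8 KV heads x 128).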
%int6_1887 = torch.constant.int 6 | |
%2074 = torch.prims.convert_element_type %2073, %int6_1887 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2074, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_1888 = torch.constant.int 2 | |
%2075 = torch.aten.pow.Tensor_Scalar %2074, %int2_1888 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2075, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_1889 = torch.constant.int -1 | |
%2076 = torch.prim.ListConstruct %int-1_1889 : (!torch.int) -> !torch.list<int> | |
%true_1890 = torch.constant.bool true | |
%none_1891 = torch.constant.none | |
%2077 = torch.aten.mean.dim %2075, %2076, %true_1890, %none_1891 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2077, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_1892 = torch.constant.float 1.000000e-05 | |
%int1_1893 = torch.constant.int 1 | |
%2078 = torch.aten.add.Scalar %2077, %float1.000000e-05_1892, %int1_1893 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2078, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2079 = torch.aten.rsqrt %2078 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2079, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2080 = torch.aten.mul.Tensor %2074, %2079 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2080, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_1894 = torch.constant.int 15 | |
%2081 = torch.prims.convert_element_type %2080, %int15_1894 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2081, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2082 = torch.aten.mul.Tensor %86, %2081 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2082, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2083 = torch.aten.div.Tensor %2082, %87 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2083, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1895 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1896 = torch.constant.float 2.400000e+02 | |
%2084 = torch.aten.clamp %2083, %float-2.400000e02_1895, %float2.400000e02_1896 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2084, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1897 = torch.constant.int 26 | |
%2085 = torch.prims.convert_element_type %2084, %int26_1897 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2085, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1898 = torch.constant.int -2 | |
%int-1_1899 = torch.constant.int -1 | |
%2086 = torch.aten.transpose.int %88, %int-2_1898, %int-1_1899 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_1900 = torch.constant.int 4096 | |
%2087 = torch.prim.ListConstruct %564, %int4096_1900 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2088 = torch.aten.view %2085, %2087 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2088, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2089 = torch.aten.mm %2088, %2086 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2089, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_1901 = torch.constant.int 1 | |
%int4096_1902 = torch.constant.int 4096 | |
%2090 = torch.prim.ListConstruct %int1_1901, %564, %int4096_1902 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2091 = torch.aten.view %2089, %2090 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2091, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_1903 = torch.constant.int 15 | |
%2092 = torch.prims.convert_element_type %2091, %int15_1903 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2092, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
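    // Annotation: K projection — same fp8 quantize/matmul pattern, using input scale %89 and the 1024x4096 weight %90, yielding a [1,?,1024] bf16 result.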
%2093 = torch.aten.div.Tensor %2082, %89 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2093, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1904 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1905 = torch.constant.float 2.400000e+02 | |
%2094 = torch.aten.clamp %2093, %float-2.400000e02_1904, %float2.400000e02_1905 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2094, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1906 = torch.constant.int 26 | |
%2095 = torch.prims.convert_element_type %2094, %int26_1906 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2095, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1907 = torch.constant.int -2 | |
%int-1_1908 = torch.constant.int -1 | |
%2096 = torch.aten.transpose.int %90, %int-2_1907, %int-1_1908 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_1909 = torch.constant.int 4096 | |
%2097 = torch.prim.ListConstruct %564, %int4096_1909 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2098 = torch.aten.view %2095, %2097 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2098, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2099 = torch.aten.mm %2098, %2096 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2099, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_1910 = torch.constant.int 1 | |
%int1024_1911 = torch.constant.int 1024 | |
%2100 = torch.prim.ListConstruct %int1_1910, %564, %int1024_1911 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2101 = torch.aten.view %2099, %2100 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2101, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_1912 = torch.constant.int 15 | |
%2102 = torch.prims.convert_element_type %2101, %int15_1912 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %2102, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
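    // Annotation: V projection — fp8 quantize with input scale %91 and matmul with the 1024x4096 weight %92.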
%2103 = torch.aten.div.Tensor %2082, %91 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2103, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_1913 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1914 = torch.constant.float 2.400000e+02 | |
%2104 = torch.aten.clamp %2103, %float-2.400000e02_1913, %float2.400000e02_1914 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2104, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_1915 = torch.constant.int 26 | |
%2105 = torch.prims.convert_element_type %2104, %int26_1915 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2105, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_1916 = torch.constant.int -2 | |
%int-1_1917 = torch.constant.int -1 | |
%2106 = torch.aten.transpose.int %92, %int-2_1916, %int-1_1917 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_1918 = torch.constant.int 4096 | |
%2107 = torch.prim.ListConstruct %564, %int4096_1918 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2108 = torch.aten.view %2105, %2107 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2108, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2109 = torch.aten.mm %2108, %2106 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2109, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_1919 = torch.constant.int 1 | |
%int1024_1920 = torch.constant.int 1024 | |
%2110 = torch.prim.ListConstruct %int1_1919, %564, %int1024_1920 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2111 = torch.aten.view %2109, %2110 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2111, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_1921 = torch.constant.int 15 | |
%2112 = torch.prims.convert_element_type %2111, %int15_1921 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %2112, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
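    // Annotation: reshape the projections into heads: Q -> [1,?,32,128], K and V -> [1,?,8,128] (32 query heads, 8 KV heads).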
%int1_1922 = torch.constant.int 1 | |
%int32_1923 = torch.constant.int 32 | |
%int128_1924 = torch.constant.int 128 | |
%2113 = torch.prim.ListConstruct %int1_1922, %564, %int32_1923, %int128_1924 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2114 = torch.aten.view %2092, %2113 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2114, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_1925 = torch.constant.int 1 | |
%int8_1926 = torch.constant.int 8 | |
%int128_1927 = torch.constant.int 128 | |
%2115 = torch.prim.ListConstruct %int1_1925, %564, %int8_1926, %int128_1927 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2116 = torch.aten.view %2102, %2115 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2116, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_1928 = torch.constant.int 1 | |
%int8_1929 = torch.constant.int 8 | |
%int128_1930 = torch.constant.int 128 | |
%2117 = torch.prim.ListConstruct %int1_1928, %564, %int8_1929, %int128_1930 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2118 = torch.aten.view %2112, %2117 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2118, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
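    // Annotation: build the rotary-embedding tables: positions 0..131071, inverse frequencies from base 5.0e5 with Llama-3-style low/high-frequency scaling (factor 8, original context 8192), then cos/sin tables converted to bf16.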
%int131072_1931 = torch.constant.int 131072 | |
%none_1932 = torch.constant.none | |
%none_1933 = torch.constant.none | |
%cpu_1934 = torch.constant.device "cpu" | |
%false_1935 = torch.constant.bool false | |
%2119 = torch.aten.arange %int131072_1931, %none_1932, %none_1933, %cpu_1934, %false_1935 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1936 = torch.constant.int 0 | |
%int128_1937 = torch.constant.int 128 | |
%int2_1938 = torch.constant.int 2 | |
%int4_1939 = torch.constant.int 4 | |
%none_1940 = torch.constant.none | |
%cpu_1941 = torch.constant.device "cpu" | |
%false_1942 = torch.constant.bool false | |
%2120 = torch.aten.arange.start_step %int0_1936, %int128_1937, %int2_1938, %int4_1939, %none_1940, %cpu_1941, %false_1942 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1943 = torch.constant.int 6 | |
%2121 = torch.prims.convert_element_type %2120, %int6_1943 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1944 = torch.constant.int 128 | |
%2122 = torch.aten.div.Scalar %2121, %int128_1944 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1945 = torch.constant.float 5.000000e+05 | |
%2123 = torch.aten.pow.Scalar %float5.000000e05_1945, %2122 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2124 = torch.aten.reciprocal %2123 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1946 = torch.constant.float 1.000000e+00 | |
%2125 = torch.aten.mul.Scalar %2124, %float1.000000e00_1946 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2126 = torch.aten.reciprocal %2125 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1947 = torch.constant.float 6.2831853071795862 | |
%2127 = torch.aten.mul.Scalar %2126, %float6.283190e00_1947 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1948 = torch.constant.float 8.192000e+03 | |
%2128 = torch.aten.gt.Scalar %2127, %float8.192000e03_1948 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1949 = torch.constant.int 8 | |
%2129 = torch.aten.div.Scalar %2125, %int8_1949 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2130 = torch.aten.where.self %2128, %2129, %2125 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2131 = torch.aten.reciprocal %2127 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1950 = torch.constant.int 8192 | |
%2132 = torch.aten.mul.Scalar %2131, %int8192_1950 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1951 = torch.constant.int 1 | |
%int1_1952 = torch.constant.int 1 | |
%2133 = torch.aten.sub.Scalar %2132, %int1_1951, %int1_1952 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1953 = torch.constant.int 3 | |
%2134 = torch.aten.div.Scalar %2133, %int3_1953 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1954 = torch.constant.int 1 | |
%int1_1955 = torch.constant.int 1 | |
%2135 = torch.aten.rsub.Scalar %2134, %int1_1954, %int1_1955 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2136 = torch.aten.mul.Tensor %2135, %2130 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1956 = torch.constant.int 8 | |
%2137 = torch.aten.div.Scalar %2136, %int8_1956 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2138 = torch.aten.mul.Tensor %2134, %2130 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1957 = torch.constant.int 1 | |
%2139 = torch.aten.add.Tensor %2137, %2138, %int1_1957 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1958 = torch.constant.float 2.048000e+03 | |
%2140 = torch.aten.lt.Scalar %2127, %float2.048000e03_1958 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2141 = torch.aten.bitwise_not %2140 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1959 = torch.constant.float 8.192000e+03 | |
%2142 = torch.aten.gt.Scalar %2127, %float8.192000e03_1959 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2143 = torch.aten.bitwise_not %2142 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2144 = torch.aten.mul.Tensor %2141, %2143 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2145 = torch.aten.where.self %2144, %2139, %2130 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2146 = torch.prim.ListConstruct %2145, %2145 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1960 = torch.constant.int -1 | |
%2147 = torch.aten.cat %2146, %int-1_1960 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1961 = torch.constant.int 6 | |
%2148 = torch.prims.convert_element_type %2119, %int6_1961 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_1962 = torch.constant.int 131072 | |
%int1_1963 = torch.constant.int 1 | |
%2149 = torch.prim.ListConstruct %int131072_1962, %int1_1963 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2150 = torch.aten.view %2148, %2149 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%2151 = torch.aten.mul.Tensor %2150, %2147 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2152 = torch.aten.cos %2151 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1964 = torch.constant.int 15 | |
%2153 = torch.prims.convert_element_type %2152, %int15_1964 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2154 = torch.aten.sin %2151 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1965 = torch.constant.int 15 | |
%2155 = torch.prims.convert_element_type %2154, %int15_1965 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
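    // Annotation: apply RoPE to the query heads — slice cos/sin to the sequence length, broadcast to [1,1,?,128], and combine q*cos + rotate_half(q)*sin.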
%int1_1966 = torch.constant.int 1 | |
%2156 = torch.aten.size.int %2091, %int1_1966 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_1967 = torch.constant.int 0 | |
%2157 = torch.aten.add.int %int0_1967, %2156 : !torch.int, !torch.int -> !torch.int | |
%int0_1968 = torch.constant.int 0 | |
%int0_1969 = torch.constant.int 0 | |
%int1_1970 = torch.constant.int 1 | |
%2158 = torch.aten.slice.Tensor %2153, %int0_1968, %int0_1969, %2157, %int1_1970 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2158, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1971 = torch.constant.int 1 | |
%int0_1972 = torch.constant.int 0 | |
%int9223372036854775807_1973 = torch.constant.int 9223372036854775807 | |
%int1_1974 = torch.constant.int 1 | |
%2159 = torch.aten.slice.Tensor %2158, %int1_1971, %int0_1972, %int9223372036854775807_1973, %int1_1974 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2159, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1975 = torch.constant.int 0 | |
%2160 = torch.aten.add.int %int0_1975, %2156 : !torch.int, !torch.int -> !torch.int | |
%int0_1976 = torch.constant.int 0 | |
%int0_1977 = torch.constant.int 0 | |
%int1_1978 = torch.constant.int 1 | |
%2161 = torch.aten.slice.Tensor %2155, %int0_1976, %int0_1977, %2160, %int1_1978 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2161, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1979 = torch.constant.int 1 | |
%int0_1980 = torch.constant.int 0 | |
%int9223372036854775807_1981 = torch.constant.int 9223372036854775807 | |
%int1_1982 = torch.constant.int 1 | |
%2162 = torch.aten.slice.Tensor %2161, %int1_1979, %int0_1980, %int9223372036854775807_1981, %int1_1982 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2162, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1983 = torch.constant.int 0 | |
%2163 = torch.aten.unsqueeze %2159, %int0_1983 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2163, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1984 = torch.constant.int 1 | |
%int0_1985 = torch.constant.int 0 | |
%int9223372036854775807_1986 = torch.constant.int 9223372036854775807 | |
%int1_1987 = torch.constant.int 1 | |
%2164 = torch.aten.slice.Tensor %2163, %int1_1984, %int0_1985, %int9223372036854775807_1986, %int1_1987 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2164, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1988 = torch.constant.int 2 | |
%2165 = torch.aten.unsqueeze %2164, %int2_1988 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2165, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1989 = torch.constant.int 3 | |
%int0_1990 = torch.constant.int 0 | |
%int9223372036854775807_1991 = torch.constant.int 9223372036854775807 | |
%int1_1992 = torch.constant.int 1 | |
%2166 = torch.aten.slice.Tensor %2165, %int3_1989, %int0_1990, %int9223372036854775807_1991, %int1_1992 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2166, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_1993 = torch.constant.int 0 | |
%2167 = torch.aten.unsqueeze %2162, %int0_1993 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2167, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1994 = torch.constant.int 1 | |
%int0_1995 = torch.constant.int 0 | |
%int9223372036854775807_1996 = torch.constant.int 9223372036854775807 | |
%int1_1997 = torch.constant.int 1 | |
%2168 = torch.aten.slice.Tensor %2167, %int1_1994, %int0_1995, %int9223372036854775807_1996, %int1_1997 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2168, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1998 = torch.constant.int 2 | |
%2169 = torch.aten.unsqueeze %2168, %int2_1998 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2169, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1999 = torch.constant.int 3 | |
%int0_2000 = torch.constant.int 0 | |
%int9223372036854775807_2001 = torch.constant.int 9223372036854775807 | |
%int1_2002 = torch.constant.int 1 | |
%2170 = torch.aten.slice.Tensor %2169, %int3_1999, %int0_2000, %int9223372036854775807_2001, %int1_2002 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2170, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_2003 = torch.constant.int 1 | |
%int2_2004 = torch.constant.int 2 | |
%2171 = torch.aten.transpose.int %2166, %int1_2003, %int2_2004 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2171, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2005 = torch.constant.int 1 | |
%int1_2006 = torch.constant.int 1 | |
%int1_2007 = torch.constant.int 1 | |
%int1_2008 = torch.constant.int 1 | |
%2172 = torch.prim.ListConstruct %int1_2005, %int1_2006, %int1_2007, %int1_2008 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2173 = torch.aten.repeat %2171, %2172 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2173, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2009 = torch.constant.int 1 | |
%int2_2010 = torch.constant.int 2 | |
%2174 = torch.aten.transpose.int %2170, %int1_2009, %int2_2010 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2174, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2011 = torch.constant.int 1 | |
%int2_2012 = torch.constant.int 2 | |
%2175 = torch.aten.transpose.int %2114, %int1_2011, %int2_2012 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2175, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2013 = torch.constant.int 1 | |
%int1_2014 = torch.constant.int 1 | |
%int1_2015 = torch.constant.int 1 | |
%int1_2016 = torch.constant.int 1 | |
%2176 = torch.prim.ListConstruct %int1_2013, %int1_2014, %int1_2015, %int1_2016 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2177 = torch.aten.repeat %2174, %2176 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2177, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%2178 = torch.aten.mul.Tensor %2175, %2173 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2178, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_2017 = torch.constant.int 3 | |
%int0_2018 = torch.constant.int 0 | |
%int64_2019 = torch.constant.int 64 | |
%int1_2020 = torch.constant.int 1 | |
%2179 = torch.aten.slice.Tensor %2175, %int3_2017, %int0_2018, %int64_2019, %int1_2020 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2179, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_2021 = torch.constant.int 3 | |
%int64_2022 = torch.constant.int 64 | |
%int9223372036854775807_2023 = torch.constant.int 9223372036854775807 | |
%int1_2024 = torch.constant.int 1 | |
%2180 = torch.aten.slice.Tensor %2175, %int3_2021, %int64_2022, %int9223372036854775807_2023, %int1_2024 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2180, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%2181 = torch.aten.neg %2180 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2181, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%2182 = torch.prim.ListConstruct %2181, %2179 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_2025 = torch.constant.int -1 | |
%2183 = torch.aten.cat %2182, %int-1_2025 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2183, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%2184 = torch.aten.mul.Tensor %2183, %2177 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2184, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2026 = torch.constant.int 1 | |
%2185 = torch.aten.add.Tensor %2178, %2184, %int1_2026 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2185, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2027 = torch.constant.int 1 | |
%int2_2028 = torch.constant.int 2 | |
%2186 = torch.aten.transpose.int %2185, %int1_2027, %int2_2028 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2186, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
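    // Annotation: the same rotary cos/sin tables are recomputed below, this time for the key heads.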
%int131072_2029 = torch.constant.int 131072 | |
%none_2030 = torch.constant.none | |
%none_2031 = torch.constant.none | |
%cpu_2032 = torch.constant.device "cpu" | |
%false_2033 = torch.constant.bool false | |
%2187 = torch.aten.arange %int131072_2029, %none_2030, %none_2031, %cpu_2032, %false_2033 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2034 = torch.constant.int 0 | |
%int128_2035 = torch.constant.int 128 | |
%int2_2036 = torch.constant.int 2 | |
%int4_2037 = torch.constant.int 4 | |
%none_2038 = torch.constant.none | |
%cpu_2039 = torch.constant.device "cpu" | |
%false_2040 = torch.constant.bool false | |
%2188 = torch.aten.arange.start_step %int0_2034, %int128_2035, %int2_2036, %int4_2037, %none_2038, %cpu_2039, %false_2040 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2041 = torch.constant.int 6 | |
%2189 = torch.prims.convert_element_type %2188, %int6_2041 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2042 = torch.constant.int 128 | |
%2190 = torch.aten.div.Scalar %2189, %int128_2042 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2043 = torch.constant.float 5.000000e+05 | |
%2191 = torch.aten.pow.Scalar %float5.000000e05_2043, %2190 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2192 = torch.aten.reciprocal %2191 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2044 = torch.constant.float 1.000000e+00 | |
%2193 = torch.aten.mul.Scalar %2192, %float1.000000e00_2044 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2194 = torch.aten.reciprocal %2193 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2045 = torch.constant.float 6.2831853071795862 | |
%2195 = torch.aten.mul.Scalar %2194, %float6.283190e00_2045 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2046 = torch.constant.float 8.192000e+03 | |
%2196 = torch.aten.gt.Scalar %2195, %float8.192000e03_2046 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2047 = torch.constant.int 8 | |
%2197 = torch.aten.div.Scalar %2193, %int8_2047 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2198 = torch.aten.where.self %2196, %2197, %2193 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2199 = torch.aten.reciprocal %2195 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2048 = torch.constant.int 8192 | |
%2200 = torch.aten.mul.Scalar %2199, %int8192_2048 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2049 = torch.constant.int 1 | |
%int1_2050 = torch.constant.int 1 | |
%2201 = torch.aten.sub.Scalar %2200, %int1_2049, %int1_2050 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2051 = torch.constant.int 3 | |
%2202 = torch.aten.div.Scalar %2201, %int3_2051 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2052 = torch.constant.int 1 | |
%int1_2053 = torch.constant.int 1 | |
%2203 = torch.aten.rsub.Scalar %2202, %int1_2052, %int1_2053 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2204 = torch.aten.mul.Tensor %2203, %2198 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2054 = torch.constant.int 8 | |
%2205 = torch.aten.div.Scalar %2204, %int8_2054 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2206 = torch.aten.mul.Tensor %2202, %2198 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2055 = torch.constant.int 1 | |
%2207 = torch.aten.add.Tensor %2205, %2206, %int1_2055 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2056 = torch.constant.float 2.048000e+03 | |
%2208 = torch.aten.lt.Scalar %2195, %float2.048000e03_2056 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2209 = torch.aten.bitwise_not %2208 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2057 = torch.constant.float 8.192000e+03 | |
%2210 = torch.aten.gt.Scalar %2195, %float8.192000e03_2057 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2211 = torch.aten.bitwise_not %2210 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2212 = torch.aten.mul.Tensor %2209, %2211 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2213 = torch.aten.where.self %2212, %2207, %2198 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2214 = torch.prim.ListConstruct %2213, %2213 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2058 = torch.constant.int -1 | |
%2215 = torch.aten.cat %2214, %int-1_2058 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2059 = torch.constant.int 6 | |
%2216 = torch.prims.convert_element_type %2187, %int6_2059 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_2060 = torch.constant.int 131072 | |
%int1_2061 = torch.constant.int 1 | |
%2217 = torch.prim.ListConstruct %int131072_2060, %int1_2061 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2218 = torch.aten.view %2216, %2217 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%2219 = torch.aten.mul.Tensor %2218, %2215 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2220 = torch.aten.cos %2219 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2062 = torch.constant.int 15 | |
%2221 = torch.prims.convert_element_type %2220, %int15_2062 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2222 = torch.aten.sin %2219 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2063 = torch.constant.int 15 | |
%2223 = torch.prims.convert_element_type %2222, %int15_2063 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
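    // Annotation: apply RoPE to the key heads using the freshly sliced cos/sin tables.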
%int1_2064 = torch.constant.int 1 | |
%2224 = torch.aten.size.int %2101, %int1_2064 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_2065 = torch.constant.int 0 | |
%2225 = torch.aten.add.int %int0_2065, %2224 : !torch.int, !torch.int -> !torch.int | |
%int0_2066 = torch.constant.int 0 | |
%int0_2067 = torch.constant.int 0 | |
%int1_2068 = torch.constant.int 1 | |
%2226 = torch.aten.slice.Tensor %2221, %int0_2066, %int0_2067, %2225, %int1_2068 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2226, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2069 = torch.constant.int 1 | |
%int0_2070 = torch.constant.int 0 | |
%int9223372036854775807_2071 = torch.constant.int 9223372036854775807 | |
%int1_2072 = torch.constant.int 1 | |
%2227 = torch.aten.slice.Tensor %2226, %int1_2069, %int0_2070, %int9223372036854775807_2071, %int1_2072 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2227, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2073 = torch.constant.int 0 | |
%2228 = torch.aten.add.int %int0_2073, %2224 : !torch.int, !torch.int -> !torch.int | |
%int0_2074 = torch.constant.int 0 | |
%int0_2075 = torch.constant.int 0 | |
%int1_2076 = torch.constant.int 1 | |
%2229 = torch.aten.slice.Tensor %2223, %int0_2074, %int0_2075, %2228, %int1_2076 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2229, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2077 = torch.constant.int 1 | |
%int0_2078 = torch.constant.int 0 | |
%int9223372036854775807_2079 = torch.constant.int 9223372036854775807 | |
%int1_2080 = torch.constant.int 1 | |
%2230 = torch.aten.slice.Tensor %2229, %int1_2077, %int0_2078, %int9223372036854775807_2079, %int1_2080 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2230, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2081 = torch.constant.int 0 | |
%2231 = torch.aten.unsqueeze %2227, %int0_2081 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2231, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2082 = torch.constant.int 1 | |
%int0_2083 = torch.constant.int 0 | |
%int9223372036854775807_2084 = torch.constant.int 9223372036854775807 | |
%int1_2085 = torch.constant.int 1 | |
%2232 = torch.aten.slice.Tensor %2231, %int1_2082, %int0_2083, %int9223372036854775807_2084, %int1_2085 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2232, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2086 = torch.constant.int 2 | |
%2233 = torch.aten.unsqueeze %2232, %int2_2086 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2233, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2087 = torch.constant.int 3 | |
%int0_2088 = torch.constant.int 0 | |
%int9223372036854775807_2089 = torch.constant.int 9223372036854775807 | |
%int1_2090 = torch.constant.int 1 | |
%2234 = torch.aten.slice.Tensor %2233, %int3_2087, %int0_2088, %int9223372036854775807_2089, %int1_2090 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2234, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_2091 = torch.constant.int 0 | |
%2235 = torch.aten.unsqueeze %2230, %int0_2091 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2235, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2092 = torch.constant.int 1 | |
%int0_2093 = torch.constant.int 0 | |
%int9223372036854775807_2094 = torch.constant.int 9223372036854775807 | |
%int1_2095 = torch.constant.int 1 | |
%2236 = torch.aten.slice.Tensor %2235, %int1_2092, %int0_2093, %int9223372036854775807_2094, %int1_2095 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2236, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2096 = torch.constant.int 2 | |
%2237 = torch.aten.unsqueeze %2236, %int2_2096 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2237, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2097 = torch.constant.int 3 | |
%int0_2098 = torch.constant.int 0 | |
%int9223372036854775807_2099 = torch.constant.int 9223372036854775807 | |
%int1_2100 = torch.constant.int 1 | |
%2238 = torch.aten.slice.Tensor %2237, %int3_2097, %int0_2098, %int9223372036854775807_2099, %int1_2100 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2238, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_2101 = torch.constant.int 1 | |
%int2_2102 = torch.constant.int 2 | |
%2239 = torch.aten.transpose.int %2234, %int1_2101, %int2_2102 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2239, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2103 = torch.constant.int 1 | |
%int1_2104 = torch.constant.int 1 | |
%int1_2105 = torch.constant.int 1 | |
%int1_2106 = torch.constant.int 1 | |
%2240 = torch.prim.ListConstruct %int1_2103, %int1_2104, %int1_2105, %int1_2106 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2241 = torch.aten.repeat %2239, %2240 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2241, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2107 = torch.constant.int 1 | |
%int2_2108 = torch.constant.int 2 | |
%2242 = torch.aten.transpose.int %2238, %int1_2107, %int2_2108 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2242, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2109 = torch.constant.int 1 | |
%int2_2110 = torch.constant.int 2 | |
%2243 = torch.aten.transpose.int %2116, %int1_2109, %int2_2110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2243, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2111 = torch.constant.int 1 | |
%int1_2112 = torch.constant.int 1 | |
%int1_2113 = torch.constant.int 1 | |
%int1_2114 = torch.constant.int 1 | |
%2244 = torch.prim.ListConstruct %int1_2111, %int1_2112, %int1_2113, %int1_2114 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2245 = torch.aten.repeat %2242, %2244 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2245, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%2246 = torch.aten.mul.Tensor %2243, %2241 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2246, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_2115 = torch.constant.int 3 | |
%int0_2116 = torch.constant.int 0 | |
%int64_2117 = torch.constant.int 64 | |
%int1_2118 = torch.constant.int 1 | |
%2247 = torch.aten.slice.Tensor %2243, %int3_2115, %int0_2116, %int64_2117, %int1_2118 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2247, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_2119 = torch.constant.int 3 | |
%int64_2120 = torch.constant.int 64 | |
%int9223372036854775807_2121 = torch.constant.int 9223372036854775807 | |
%int1_2122 = torch.constant.int 1 | |
%2248 = torch.aten.slice.Tensor %2243, %int3_2119, %int64_2120, %int9223372036854775807_2121, %int1_2122 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2248, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%2249 = torch.aten.neg %2248 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2249, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%2250 = torch.prim.ListConstruct %2249, %2247 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_2123 = torch.constant.int -1 | |
%2251 = torch.aten.cat %2250, %int-1_2123 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2251, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%2252 = torch.aten.mul.Tensor %2251, %2245 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2252, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2124 = torch.constant.int 1 | |
%2253 = torch.aten.add.Tensor %2246, %2252, %int1_2124 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2253, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2125 = torch.constant.int 1 | |
%int2_2126 = torch.constant.int 2 | |
%2254 = torch.aten.transpose.int %2253, %int1_2125, %int2_2126 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2254, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
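    // Annotation: quantize the rotated keys (%2254) and the value heads (%2118) to f8E4M3FNUZ with the KV-cache scale %93 before writing them into the cache.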
%2255 = torch.aten.div.Tensor %2254, %93 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2255, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_2127 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2128 = torch.constant.float 2.400000e+02 | |
%2256 = torch.aten.clamp %2255, %float-2.400000e02_2127, %float2.400000e02_2128 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2256, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_2129 = torch.constant.int 26 | |
%2257 = torch.prims.convert_element_type %2256, %int26_2129 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2257, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%2258 = torch.aten.div.Tensor %2118, %93 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2258, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_2130 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2131 = torch.constant.float 2.400000e+02 | |
%2259 = torch.aten.clamp %2258, %float-2.400000e02_2130, %float2.400000e02_2131 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2259, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_2132 = torch.constant.int 26 | |
%2260 = torch.prims.convert_element_type %2259, %int26_2132 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2260, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
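    // Annotation: compute cache write positions from the page indices in %arg2, scaled by 64 (= 32 blocks x 2 K/V slots per page) and offset by 10, which appears to select this block's K sub-slot; the V sub-slot uses offset +1 further below.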
%int64_2133 = torch.constant.int 64 | |
%2261 = torch.aten.mul.Scalar %arg2, %int64_2133 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2261, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int10 = torch.constant.int 10 | |
%int1_2134 = torch.constant.int 1 | |
%2262 = torch.aten.add.Scalar %2261, %int10, %int1_2134 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2262, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
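    // Annotation: scatter the quantized K states into the flat paged cache %1993 — view it as [?,32,2,32,8,128], flatten to [?,32,8,128] rows, index_put at the computed positions, then view back to [?,2097152].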
%int1_2135 = torch.constant.int 1 | |
%int32_2136 = torch.constant.int 32 | |
%int8_2137 = torch.constant.int 8 | |
%int128_2138 = torch.constant.int 128 | |
%2263 = torch.prim.ListConstruct %int1_2135, %748, %int32_2136, %int8_2137, %int128_2138 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2264 = torch.aten.view %2257, %2263 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2264, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_2139 = torch.constant.int 32 | |
%int8_2140 = torch.constant.int 8 | |
%int128_2141 = torch.constant.int 128 | |
%2265 = torch.prim.ListConstruct %748, %int32_2139, %int8_2140, %int128_2141 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2266 = torch.aten.view %2264, %2265 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2266, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2267 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%2268 = torch.aten.view %2262, %2267 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2268, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%int32_2142 = torch.constant.int 32 | |
%int2_2143 = torch.constant.int 2 | |
%int32_2144 = torch.constant.int 32 | |
%int8_2145 = torch.constant.int 8 | |
%int128_2146 = torch.constant.int 128 | |
%2269 = torch.prim.ListConstruct %739, %int32_2142, %int2_2143, %int32_2144, %int8_2145, %int128_2146 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2270 = torch.aten.view %1993, %2269 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2270, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2147 = torch.constant.int 32 | |
%2271 = torch.aten.mul.int %739, %int32_2147 : !torch.int, !torch.int -> !torch.int | |
%int2_2148 = torch.constant.int 2 | |
%2272 = torch.aten.mul.int %2271, %int2_2148 : !torch.int, !torch.int -> !torch.int | |
%int32_2149 = torch.constant.int 32 | |
%int8_2150 = torch.constant.int 8 | |
%int128_2151 = torch.constant.int 128 | |
%2273 = torch.prim.ListConstruct %2272, %int32_2149, %int8_2150, %int128_2151 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2274 = torch.aten.view %2270, %2273 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2274, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2275 = torch.prim.ListConstruct %2268 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2152 = torch.constant.bool false | |
%2276 = torch.aten.index_put %2274, %2275, %2266, %false_2152 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2276, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2153 = torch.constant.int 32 | |
%int2_2154 = torch.constant.int 2 | |
%int32_2155 = torch.constant.int 32 | |
%int8_2156 = torch.constant.int 8 | |
%int128_2157 = torch.constant.int 128 | |
%2277 = torch.prim.ListConstruct %739, %int32_2153, %int2_2154, %int32_2155, %int8_2156, %int128_2157 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2278 = torch.aten.view %2276, %2277 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2278, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2158 = torch.constant.int 2097152 | |
%2279 = torch.prim.ListConstruct %739, %int2097152_2158 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2280 = torch.aten.view %2278, %2279 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2280, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%int32_2159 = torch.constant.int 32 | |
%int2_2160 = torch.constant.int 2 | |
%int32_2161 = torch.constant.int 32 | |
%int8_2162 = torch.constant.int 8 | |
%int128_2163 = torch.constant.int 128 | |
%2281 = torch.prim.ListConstruct %739, %int32_2159, %int2_2160, %int32_2161, %int8_2162, %int128_2163 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2282 = torch.aten.view %2280, %2281 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2282, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2164 = torch.constant.int 32 | |
%int8_2165 = torch.constant.int 8 | |
%int128_2166 = torch.constant.int 128 | |
%2283 = torch.prim.ListConstruct %2272, %int32_2164, %int8_2165, %int128_2166 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2284 = torch.aten.view %2282, %2283 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2284, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2167 = torch.constant.int 1 | |
%int32_2168 = torch.constant.int 32 | |
%int8_2169 = torch.constant.int 8 | |
%int128_2170 = torch.constant.int 128 | |
%2285 = torch.prim.ListConstruct %int1_2167, %748, %int32_2168, %int8_2169, %int128_2170 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2286 = torch.aten.view %2260, %2285 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2286, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_2171 = torch.constant.int 32 | |
%int8_2172 = torch.constant.int 8 | |
%int128_2173 = torch.constant.int 128 | |
%2287 = torch.prim.ListConstruct %748, %int32_2171, %int8_2172, %int128_2173 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2288 = torch.aten.view %2286, %2287 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2288, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
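    // Annotation: write the quantized V states at the adjacent cache slot (indices %2262 + 1) using the same view / index_put / view-back sequence.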
%int1_2174 = torch.constant.int 1 | |
%int1_2175 = torch.constant.int 1 | |
%2289 = torch.aten.add.Scalar %2262, %int1_2174, %int1_2175 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2289, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%2290 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%2291 = torch.aten.view %2289, %2290 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2291, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%2292 = torch.prim.ListConstruct %2291 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2176 = torch.constant.bool false | |
%2293 = torch.aten.index_put %2284, %2292, %2288, %false_2176 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2293, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2177 = torch.constant.int 32 | |
%int2_2178 = torch.constant.int 2 | |
%int32_2179 = torch.constant.int 32 | |
%int8_2180 = torch.constant.int 8 | |
%int128_2181 = torch.constant.int 128 | |
%2294 = torch.prim.ListConstruct %739, %int32_2177, %int2_2178, %int32_2179, %int8_2180, %int128_2181 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2295 = torch.aten.view %2293, %2294 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2295, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2182 = torch.constant.int 2097152 | |
%2296 = torch.prim.ListConstruct %739, %int2097152_2182 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2297 = torch.aten.view %2295, %2296 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2297, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
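    // Annotation: grouped-query attention — expand K from 8 KV heads to 32 heads by unsqueezing and broadcasting each KV head across 4 query heads, then flattening to [1,?,32,128].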
%int-2_2183 = torch.constant.int -2 | |
%2298 = torch.aten.unsqueeze %2257, %int-2_2183 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2298, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_2184 = torch.constant.int 1 | |
%int8_2185 = torch.constant.int 8 | |
%int4_2186 = torch.constant.int 4 | |
%int128_2187 = torch.constant.int 128 | |
%2299 = torch.prim.ListConstruct %int1_2184, %2224, %int8_2185, %int4_2186, %int128_2187 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2188 = torch.constant.bool false | |
%2300 = torch.aten.expand %2298, %2299, %false_2188 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2300, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_2189 = torch.constant.int 0 | |
%2301 = torch.aten.clone %2300, %int0_2189 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2301, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_2190 = torch.constant.int 1 | |
%int32_2191 = torch.constant.int 32 | |
%int128_2192 = torch.constant.int 128 | |
%2302 = torch.prim.ListConstruct %int1_2190, %2224, %int32_2191, %int128_2192 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2303 = torch.aten._unsafe_view %2301, %2302 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2303, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
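    // Annotation: the same 8 -> 32 head expansion is applied to V below.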
%int-2_2193 = torch.constant.int -2 | |
%2304 = torch.aten.unsqueeze %2260, %int-2_2193 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2304, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_2194 = torch.constant.int 1 | |
%2305 = torch.aten.size.int %2111, %int1_2194 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_2195 = torch.constant.int 1 | |
%int8_2196 = torch.constant.int 8 | |
%int4_2197 = torch.constant.int 4 | |
%int128_2198 = torch.constant.int 128 | |
%2306 = torch.prim.ListConstruct %int1_2195, %2305, %int8_2196, %int4_2197, %int128_2198 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2199 = torch.constant.bool false | |
%2307 = torch.aten.expand %2304, %2306, %false_2199 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2307, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_2200 = torch.constant.int 0 | |
%2308 = torch.aten.clone %2307, %int0_2200 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2308, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_2201 = torch.constant.int 1 | |
%int32_2202 = torch.constant.int 32 | |
%int128_2203 = torch.constant.int 128 | |
%2309 = torch.prim.ListConstruct %int1_2201, %2305, %int32_2202, %int128_2203 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2310 = torch.aten._unsafe_view %2308, %2309 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2310, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
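    // Annotation: grouped-query-attention expansion — each of the 8 K/V heads is repeated 4x (unsqueeze -> expand -> clone -> _unsafe_view) to match the 32 query heads, giving [1, seq, 32, 128] for both K and V.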
%int6_2204 = torch.constant.int 6 | |
%2311 = torch.prims.convert_element_type %2303, %int6_2204 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2311, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2312 = torch.aten.mul.Tensor %2311, %93 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2312, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_2205 = torch.constant.int 15 | |
%2313 = torch.prims.convert_element_type %2312, %int15_2205 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2313, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_2206 = torch.constant.int 6 | |
%2314 = torch.prims.convert_element_type %2310, %int6_2206 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2314, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2315 = torch.aten.mul.Tensor %2314, %93 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2315, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_2207 = torch.constant.int 15 | |
%2316 = torch.prims.convert_element_type %2315, %int15_2207 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2316, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
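    // Annotation: dequantize the expanded K and V from f8E4M3FNUZ to bf16 — cast to f32, multiply by the per-tensor scale %93 (presumably the KV-cache dequantization scale), cast to bf16.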
%int1_2208 = torch.constant.int 1 | |
%int2_2209 = torch.constant.int 2 | |
%2317 = torch.aten.transpose.int %2186, %int1_2208, %int2_2209 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2317, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2210 = torch.constant.int 1 | |
%int2_2211 = torch.constant.int 2 | |
%2318 = torch.aten.transpose.int %2313, %int1_2210, %int2_2211 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2318, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2212 = torch.constant.int 1 | |
%int2_2213 = torch.constant.int 2 | |
%2319 = torch.aten.transpose.int %2316, %int1_2212, %int2_2213 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2319, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00_2214 = torch.constant.float 0.000000e+00 | |
%true_2215 = torch.constant.bool true | |
%none_2216 = torch.constant.none | |
%none_2217 = torch.constant.none | |
%2320:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2317, %2318, %2319, %float0.000000e00_2214, %true_2215, %none_2216, %none_2217) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %2320#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
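    // Annotation: Q, K and V are transposed to [1, 32, seq, 128] and fed to causal flash attention (dropout 0.0, is_causal = true, no explicit mask or scale override).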
%int1_2218 = torch.constant.int 1 | |
%int2_2219 = torch.constant.int 2 | |
%2321 = torch.aten.transpose.int %2320#0, %int1_2218, %int2_2219 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2321, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_2220 = torch.constant.int 1 | |
%int4096_2221 = torch.constant.int 4096 | |
%2322 = torch.prim.ListConstruct %int1_2220, %2156, %int4096_2221 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2323 = torch.aten.view %2321, %2322 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2323, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2324 = torch.aten.div.Tensor %2323, %94 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2324, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2222 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2223 = torch.constant.float 2.400000e+02 | |
%2325 = torch.aten.clamp %2324, %float-2.400000e02_2222, %float2.400000e02_2223 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2325, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2224 = torch.constant.int 26 | |
%2326 = torch.prims.convert_element_type %2325, %int26_2224 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2326, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
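    // Annotation: the attention output is folded back to [1, seq, 4096], then re-quantized for the output projection — divide by the input scale %94, clamp to the f8E4M3FNUZ finite range [-240, 240], cast to f8E4M3FNUZ.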
%int-2_2225 = torch.constant.int -2 | |
%int-1_2226 = torch.constant.int -1 | |
%2327 = torch.aten.transpose.int %95, %int-2_2225, %int-1_2226 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_2227 = torch.constant.int 4096 | |
%2328 = torch.prim.ListConstruct %2156, %int4096_2227 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2329 = torch.aten.view %2326, %2328 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2329, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2330 = torch.aten.mm %2329, %2327 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2330, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2228 = torch.constant.int 1 | |
%int4096_2229 = torch.constant.int 4096 | |
%2331 = torch.prim.ListConstruct %int1_2228, %2156, %int4096_2229 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2332 = torch.aten.view %2330, %2331 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2332, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2230 = torch.constant.int 15 | |
%2333 = torch.prims.convert_element_type %2332, %int15_2230 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2333, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_2231 = torch.constant.int 1 | |
%2334 = torch.aten.add.Tensor %2073, %2333, %int1_2231 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2334, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
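    // Annotation: attn_output projection as an f8 matmul against the transposed 4096x4096 weight %95, dequantized to bf16 (%2333) and added to the residual stream (%2073).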
%int6_2232 = torch.constant.int 6 | |
%2335 = torch.prims.convert_element_type %2334, %int6_2232 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2335, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_2233 = torch.constant.int 2 | |
%2336 = torch.aten.pow.Tensor_Scalar %2335, %int2_2233 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2336, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_2234 = torch.constant.int -1 | |
%2337 = torch.prim.ListConstruct %int-1_2234 : (!torch.int) -> !torch.list<int> | |
%true_2235 = torch.constant.bool true | |
%none_2236 = torch.constant.none | |
%2338 = torch.aten.mean.dim %2336, %2337, %true_2235, %none_2236 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2338, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_2237 = torch.constant.float 1.000000e-05 | |
%int1_2238 = torch.constant.int 1 | |
%2339 = torch.aten.add.Scalar %2338, %float1.000000e-05_2237, %int1_2238 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2339, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2340 = torch.aten.rsqrt %2339 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2340, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2341 = torch.aten.mul.Tensor %2335, %2340 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2341, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_2239 = torch.constant.int 15 | |
%2342 = torch.prims.convert_element_type %2341, %int15_2239 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2342, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2343 = torch.aten.mul.Tensor %96, %2342 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2343, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
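    // Annotation: RMSNorm — mean of squares over the hidden dim, add eps 1e-05, rsqrt, scale the activations, then multiply by the ffn_norm weight %96.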
%2344 = torch.aten.div.Tensor %2343, %97 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2344, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2240 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2241 = torch.constant.float 2.400000e+02 | |
%2345 = torch.aten.clamp %2344, %float-2.400000e02_2240, %float2.400000e02_2241 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2345, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2242 = torch.constant.int 26 | |
%2346 = torch.prims.convert_element_type %2345, %int26_2242 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2346, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2243 = torch.constant.int -2 | |
%int-1_2244 = torch.constant.int -1 | |
%2347 = torch.aten.transpose.int %98, %int-2_2243, %int-1_2244 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_2245 = torch.constant.int 4096 | |
%2348 = torch.prim.ListConstruct %564, %int4096_2245 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2349 = torch.aten.view %2346, %2348 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2349, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2350 = torch.aten.mm %2349, %2347 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2350, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_2246 = torch.constant.int 1 | |
%int14336_2247 = torch.constant.int 14336 | |
%2351 = torch.prim.ListConstruct %int1_2246, %564, %int14336_2247 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2352 = torch.aten.view %2350, %2351 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2352, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_2248 = torch.constant.int 15 | |
%2353 = torch.prims.convert_element_type %2352, %int15_2248 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2353, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2354 = torch.aten.silu %2353 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2354, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
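    // Annotation: ffn_gate branch — quantize the normed activations (scale %97, clamp to +/-240), matmul against the transposed [14336, 4096] gate weight %98, dequantize to bf16, apply SiLU.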
%2355 = torch.aten.div.Tensor %2343, %99 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2355, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2249 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2250 = torch.constant.float 2.400000e+02 | |
%2356 = torch.aten.clamp %2355, %float-2.400000e02_2249, %float2.400000e02_2250 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2356, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2251 = torch.constant.int 26 | |
%2357 = torch.prims.convert_element_type %2356, %int26_2251 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2357, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2252 = torch.constant.int -2 | |
%int-1_2253 = torch.constant.int -1 | |
%2358 = torch.aten.transpose.int %100, %int-2_2252, %int-1_2253 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_2254 = torch.constant.int 4096 | |
%2359 = torch.prim.ListConstruct %564, %int4096_2254 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2360 = torch.aten.view %2357, %2359 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2360, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2361 = torch.aten.mm %2360, %2358 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2361, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_2255 = torch.constant.int 1 | |
%int14336_2256 = torch.constant.int 14336 | |
%2362 = torch.prim.ListConstruct %int1_2255, %564, %int14336_2256 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2363 = torch.aten.view %2361, %2362 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2363, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_2257 = torch.constant.int 15 | |
%2364 = torch.prims.convert_element_type %2363, %int15_2257 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2364, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2365 = torch.aten.mul.Tensor %2354, %2364 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2365, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
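    // Annotation: ffn_up branch through weight %100, followed by the elementwise product with the SiLU-gated branch — the SwiGLU feed-forward activation.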
%2366 = torch.aten.div.Tensor %2365, %101 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2366, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_2258 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2259 = torch.constant.float 2.400000e+02 | |
%2367 = torch.aten.clamp %2366, %float-2.400000e02_2258, %float2.400000e02_2259 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2367, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_2260 = torch.constant.int 26 | |
%2368 = torch.prims.convert_element_type %2367, %int26_2260 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2368, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_2261 = torch.constant.int -2 | |
%int-1_2262 = torch.constant.int -1 | |
%2369 = torch.aten.transpose.int %102, %int-2_2261, %int-1_2262 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_2263 = torch.constant.int 1 | |
%2370 = torch.aten.size.int %2352, %int1_2263 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_2264 = torch.constant.int 14336 | |
%2371 = torch.prim.ListConstruct %2370, %int14336_2264 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2372 = torch.aten.view %2368, %2371 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2372, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%2373 = torch.aten.mm %2372, %2369 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2373, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2265 = torch.constant.int 1 | |
%int4096_2266 = torch.constant.int 4096 | |
%2374 = torch.prim.ListConstruct %int1_2265, %2370, %int4096_2266 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2375 = torch.aten.view %2373, %2374 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2375, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2267 = torch.constant.int 15 | |
%2376 = torch.prims.convert_element_type %2375, %int15_2267 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2376, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_2268 = torch.constant.int 1 | |
%2377 = torch.aten.add.Tensor %2334, %2376, %int1_2268 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2377, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
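    // Annotation: ffn_down projection through the [4096, 14336] weight %102 back to the model dim, dequantized to bf16 and added to the residual stream (%2334).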
%int6_2269 = torch.constant.int 6 | |
%2378 = torch.prims.convert_element_type %2377, %int6_2269 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2378, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_2270 = torch.constant.int 2 | |
%2379 = torch.aten.pow.Tensor_Scalar %2378, %int2_2270 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2379, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_2271 = torch.constant.int -1 | |
%2380 = torch.prim.ListConstruct %int-1_2271 : (!torch.int) -> !torch.list<int> | |
%true_2272 = torch.constant.bool true | |
%none_2273 = torch.constant.none | |
%2381 = torch.aten.mean.dim %2379, %2380, %true_2272, %none_2273 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2381, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_2274 = torch.constant.float 1.000000e-05 | |
%int1_2275 = torch.constant.int 1 | |
%2382 = torch.aten.add.Scalar %2381, %float1.000000e-05_2274, %int1_2275 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2382, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2383 = torch.aten.rsqrt %2382 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2383, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2384 = torch.aten.mul.Tensor %2378, %2383 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2384, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_2276 = torch.constant.int 15 | |
%2385 = torch.prims.convert_element_type %2384, %int15_2276 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2385, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2386 = torch.aten.mul.Tensor %103, %2385 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2386, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
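    // Annotation: another RMSNorm (eps 1e-05) with weight %103, which appears to be the following decoder block's attn_norm, preparing the input for that block's attention.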
%2387 = torch.aten.div.Tensor %2386, %104 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2387, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2277 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2278 = torch.constant.float 2.400000e+02 | |
%2388 = torch.aten.clamp %2387, %float-2.400000e02_2277, %float2.400000e02_2278 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2388, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2279 = torch.constant.int 26 | |
%2389 = torch.prims.convert_element_type %2388, %int26_2279 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2389, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2280 = torch.constant.int -2 | |
%int-1_2281 = torch.constant.int -1 | |
%2390 = torch.aten.transpose.int %105, %int-2_2280, %int-1_2281 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_2282 = torch.constant.int 4096 | |
%2391 = torch.prim.ListConstruct %564, %int4096_2282 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2392 = torch.aten.view %2389, %2391 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2392, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2393 = torch.aten.mm %2392, %2390 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2393, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2283 = torch.constant.int 1 | |
%int4096_2284 = torch.constant.int 4096 | |
%2394 = torch.prim.ListConstruct %int1_2283, %564, %int4096_2284 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2395 = torch.aten.view %2393, %2394 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2395, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2285 = torch.constant.int 15 | |
%2396 = torch.prims.convert_element_type %2395, %int15_2285 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2396, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2397 = torch.aten.div.Tensor %2386, %106 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2397, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2286 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2287 = torch.constant.float 2.400000e+02 | |
%2398 = torch.aten.clamp %2397, %float-2.400000e02_2286, %float2.400000e02_2287 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2398, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2288 = torch.constant.int 26 | |
%2399 = torch.prims.convert_element_type %2398, %int26_2288 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2399, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2289 = torch.constant.int -2 | |
%int-1_2290 = torch.constant.int -1 | |
%2400 = torch.aten.transpose.int %107, %int-2_2289, %int-1_2290 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_2291 = torch.constant.int 4096 | |
%2401 = torch.prim.ListConstruct %564, %int4096_2291 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2402 = torch.aten.view %2399, %2401 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2402, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2403 = torch.aten.mm %2402, %2400 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2403, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_2292 = torch.constant.int 1 | |
%int1024_2293 = torch.constant.int 1024 | |
%2404 = torch.prim.ListConstruct %int1_2292, %564, %int1024_2293 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2405 = torch.aten.view %2403, %2404 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2405, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_2294 = torch.constant.int 15 | |
%2406 = torch.prims.convert_element_type %2405, %int15_2294 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %2406, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%2407 = torch.aten.div.Tensor %2386, %108 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2407, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2295 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2296 = torch.constant.float 2.400000e+02 | |
%2408 = torch.aten.clamp %2407, %float-2.400000e02_2295, %float2.400000e02_2296 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2408, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2297 = torch.constant.int 26 | |
%2409 = torch.prims.convert_element_type %2408, %int26_2297 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2409, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2298 = torch.constant.int -2 | |
%int-1_2299 = torch.constant.int -1 | |
%2410 = torch.aten.transpose.int %109, %int-2_2298, %int-1_2299 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_2300 = torch.constant.int 4096 | |
%2411 = torch.prim.ListConstruct %564, %int4096_2300 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2412 = torch.aten.view %2409, %2411 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2412, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2413 = torch.aten.mm %2412, %2410 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2413, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_2301 = torch.constant.int 1 | |
%int1024_2302 = torch.constant.int 1024 | |
%2414 = torch.prim.ListConstruct %int1_2301, %564, %int1024_2302 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2415 = torch.aten.view %2413, %2414 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2415, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_2303 = torch.constant.int 15 | |
%2416 = torch.prims.convert_element_type %2415, %int15_2303 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %2416, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
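    // Annotation: Q (%105, 4096x4096), K (%107, 1024x4096) and V (%109, 1024x4096) projections for this block, each as quantize -> f8 matmul -> dequantize to bf16.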
%int1_2304 = torch.constant.int 1 | |
%int32_2305 = torch.constant.int 32 | |
%int128_2306 = torch.constant.int 128 | |
%2417 = torch.prim.ListConstruct %int1_2304, %564, %int32_2305, %int128_2306 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2418 = torch.aten.view %2396, %2417 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2418, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_2307 = torch.constant.int 1 | |
%int8_2308 = torch.constant.int 8 | |
%int128_2309 = torch.constant.int 128 | |
%2419 = torch.prim.ListConstruct %int1_2307, %564, %int8_2308, %int128_2309 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2420 = torch.aten.view %2406, %2419 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2420, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_2310 = torch.constant.int 1 | |
%int8_2311 = torch.constant.int 8 | |
%int128_2312 = torch.constant.int 128 | |
%2421 = torch.prim.ListConstruct %int1_2310, %564, %int8_2311, %int128_2312 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2422 = torch.aten.view %2416, %2421 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2422, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
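    // Annotation: reshape the projections into attention heads — Q to [1, seq, 32, 128], K and V to [1, seq, 8, 128].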
%int131072_2313 = torch.constant.int 131072 | |
%none_2314 = torch.constant.none | |
%none_2315 = torch.constant.none | |
%cpu_2316 = torch.constant.device "cpu" | |
%false_2317 = torch.constant.bool false | |
%2423 = torch.aten.arange %int131072_2313, %none_2314, %none_2315, %cpu_2316, %false_2317 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2318 = torch.constant.int 0 | |
%int128_2319 = torch.constant.int 128 | |
%int2_2320 = torch.constant.int 2 | |
%int4_2321 = torch.constant.int 4 | |
%none_2322 = torch.constant.none | |
%cpu_2323 = torch.constant.device "cpu" | |
%false_2324 = torch.constant.bool false | |
%2424 = torch.aten.arange.start_step %int0_2318, %int128_2319, %int2_2320, %int4_2321, %none_2322, %cpu_2323, %false_2324 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2325 = torch.constant.int 6 | |
%2425 = torch.prims.convert_element_type %2424, %int6_2325 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2326 = torch.constant.int 128 | |
%2426 = torch.aten.div.Scalar %2425, %int128_2326 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2327 = torch.constant.float 5.000000e+05 | |
%2427 = torch.aten.pow.Scalar %float5.000000e05_2327, %2426 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2428 = torch.aten.reciprocal %2427 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2328 = torch.constant.float 1.000000e+00 | |
%2429 = torch.aten.mul.Scalar %2428, %float1.000000e00_2328 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2430 = torch.aten.reciprocal %2429 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2329 = torch.constant.float 6.2831853071795862 | |
%2431 = torch.aten.mul.Scalar %2430, %float6.283190e00_2329 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2330 = torch.constant.float 8.192000e+03 | |
%2432 = torch.aten.gt.Scalar %2431, %float8.192000e03_2330 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2331 = torch.constant.int 8 | |
%2433 = torch.aten.div.Scalar %2429, %int8_2331 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2434 = torch.aten.where.self %2432, %2433, %2429 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2435 = torch.aten.reciprocal %2431 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2332 = torch.constant.int 8192 | |
%2436 = torch.aten.mul.Scalar %2435, %int8192_2332 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2333 = torch.constant.int 1 | |
%int1_2334 = torch.constant.int 1 | |
%2437 = torch.aten.sub.Scalar %2436, %int1_2333, %int1_2334 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2335 = torch.constant.int 3 | |
%2438 = torch.aten.div.Scalar %2437, %int3_2335 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2336 = torch.constant.int 1 | |
%int1_2337 = torch.constant.int 1 | |
%2439 = torch.aten.rsub.Scalar %2438, %int1_2336, %int1_2337 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2440 = torch.aten.mul.Tensor %2439, %2434 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2338 = torch.constant.int 8 | |
%2441 = torch.aten.div.Scalar %2440, %int8_2338 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2442 = torch.aten.mul.Tensor %2438, %2434 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2339 = torch.constant.int 1 | |
%2443 = torch.aten.add.Tensor %2441, %2442, %int1_2339 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2340 = torch.constant.float 2.048000e+03 | |
%2444 = torch.aten.lt.Scalar %2431, %float2.048000e03_2340 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2445 = torch.aten.bitwise_not %2444 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2341 = torch.constant.float 8.192000e+03 | |
%2446 = torch.aten.gt.Scalar %2431, %float8.192000e03_2341 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2447 = torch.aten.bitwise_not %2446 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2448 = torch.aten.mul.Tensor %2445, %2447 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2449 = torch.aten.where.self %2448, %2443, %2434 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2450 = torch.prim.ListConstruct %2449, %2449 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2342 = torch.constant.int -1 | |
%2451 = torch.aten.cat %2450, %int-1_2342 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2343 = torch.constant.int 6 | |
%2452 = torch.prims.convert_element_type %2423, %int6_2343 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_2344 = torch.constant.int 131072 | |
%int1_2345 = torch.constant.int 1 | |
%2453 = torch.prim.ListConstruct %int131072_2344, %int1_2345 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2454 = torch.aten.view %2452, %2453 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%2455 = torch.aten.mul.Tensor %2454, %2451 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2456 = torch.aten.cos %2455 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2346 = torch.constant.int 15 | |
%2457 = torch.prims.convert_element_type %2456, %int15_2346 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2458 = torch.aten.sin %2455 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2347 = torch.constant.int 15 | |
%2459 = torch.prims.convert_element_type %2458, %int15_2347 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
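    // Annotation: build rotary-embedding cos/sin tables for up to 131072 positions with base 500000; the wavelength-dependent rescaling (factor 8, bounds 2048 / 8192) appears to be the Llama-3-style long-context RoPE scheme.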
%int1_2348 = torch.constant.int 1 | |
%2460 = torch.aten.size.int %2395, %int1_2348 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_2349 = torch.constant.int 0 | |
%2461 = torch.aten.add.int %int0_2349, %2460 : !torch.int, !torch.int -> !torch.int | |
%int0_2350 = torch.constant.int 0 | |
%int0_2351 = torch.constant.int 0 | |
%int1_2352 = torch.constant.int 1 | |
%2462 = torch.aten.slice.Tensor %2457, %int0_2350, %int0_2351, %2461, %int1_2352 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2462, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2353 = torch.constant.int 1 | |
%int0_2354 = torch.constant.int 0 | |
%int9223372036854775807_2355 = torch.constant.int 9223372036854775807 | |
%int1_2356 = torch.constant.int 1 | |
%2463 = torch.aten.slice.Tensor %2462, %int1_2353, %int0_2354, %int9223372036854775807_2355, %int1_2356 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2463, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2357 = torch.constant.int 0 | |
%2464 = torch.aten.add.int %int0_2357, %2460 : !torch.int, !torch.int -> !torch.int | |
%int0_2358 = torch.constant.int 0 | |
%int0_2359 = torch.constant.int 0 | |
%int1_2360 = torch.constant.int 1 | |
%2465 = torch.aten.slice.Tensor %2459, %int0_2358, %int0_2359, %2464, %int1_2360 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2465, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2361 = torch.constant.int 1 | |
%int0_2362 = torch.constant.int 0 | |
%int9223372036854775807_2363 = torch.constant.int 9223372036854775807 | |
%int1_2364 = torch.constant.int 1 | |
%2466 = torch.aten.slice.Tensor %2465, %int1_2361, %int0_2362, %int9223372036854775807_2363, %int1_2364 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2466, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2365 = torch.constant.int 0 | |
%2467 = torch.aten.unsqueeze %2463, %int0_2365 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2467, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2366 = torch.constant.int 1 | |
%int0_2367 = torch.constant.int 0 | |
%int9223372036854775807_2368 = torch.constant.int 9223372036854775807 | |
%int1_2369 = torch.constant.int 1 | |
%2468 = torch.aten.slice.Tensor %2467, %int1_2366, %int0_2367, %int9223372036854775807_2368, %int1_2369 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2468, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2370 = torch.constant.int 2 | |
%2469 = torch.aten.unsqueeze %2468, %int2_2370 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2469, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2371 = torch.constant.int 3 | |
%int0_2372 = torch.constant.int 0 | |
%int9223372036854775807_2373 = torch.constant.int 9223372036854775807 | |
%int1_2374 = torch.constant.int 1 | |
%2470 = torch.aten.slice.Tensor %2469, %int3_2371, %int0_2372, %int9223372036854775807_2373, %int1_2374 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2470, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_2375 = torch.constant.int 0 | |
%2471 = torch.aten.unsqueeze %2466, %int0_2375 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2471, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2376 = torch.constant.int 1 | |
%int0_2377 = torch.constant.int 0 | |
%int9223372036854775807_2378 = torch.constant.int 9223372036854775807 | |
%int1_2379 = torch.constant.int 1 | |
%2472 = torch.aten.slice.Tensor %2471, %int1_2376, %int0_2377, %int9223372036854775807_2378, %int1_2379 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2472, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2380 = torch.constant.int 2 | |
%2473 = torch.aten.unsqueeze %2472, %int2_2380 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2473, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2381 = torch.constant.int 3 | |
%int0_2382 = torch.constant.int 0 | |
%int9223372036854775807_2383 = torch.constant.int 9223372036854775807 | |
%int1_2384 = torch.constant.int 1 | |
%2474 = torch.aten.slice.Tensor %2473, %int3_2381, %int0_2382, %int9223372036854775807_2383, %int1_2384 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2474, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_2385 = torch.constant.int 1 | |
%int2_2386 = torch.constant.int 2 | |
%2475 = torch.aten.transpose.int %2470, %int1_2385, %int2_2386 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2475, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2387 = torch.constant.int 1 | |
%int1_2388 = torch.constant.int 1 | |
%int1_2389 = torch.constant.int 1 | |
%int1_2390 = torch.constant.int 1 | |
%2476 = torch.prim.ListConstruct %int1_2387, %int1_2388, %int1_2389, %int1_2390 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2477 = torch.aten.repeat %2475, %2476 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2477, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2391 = torch.constant.int 1 | |
%int2_2392 = torch.constant.int 2 | |
%2478 = torch.aten.transpose.int %2474, %int1_2391, %int2_2392 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2478, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2393 = torch.constant.int 1 | |
%int2_2394 = torch.constant.int 2 | |
%2479 = torch.aten.transpose.int %2418, %int1_2393, %int2_2394 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2479, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2395 = torch.constant.int 1 | |
%int1_2396 = torch.constant.int 1 | |
%int1_2397 = torch.constant.int 1 | |
%int1_2398 = torch.constant.int 1 | |
%2480 = torch.prim.ListConstruct %int1_2395, %int1_2396, %int1_2397, %int1_2398 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2481 = torch.aten.repeat %2478, %2480 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2481, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%2482 = torch.aten.mul.Tensor %2479, %2477 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2482, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_2399 = torch.constant.int 3 | |
%int0_2400 = torch.constant.int 0 | |
%int64_2401 = torch.constant.int 64 | |
%int1_2402 = torch.constant.int 1 | |
%2483 = torch.aten.slice.Tensor %2479, %int3_2399, %int0_2400, %int64_2401, %int1_2402 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2483, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_2403 = torch.constant.int 3 | |
%int64_2404 = torch.constant.int 64 | |
%int9223372036854775807_2405 = torch.constant.int 9223372036854775807 | |
%int1_2406 = torch.constant.int 1 | |
%2484 = torch.aten.slice.Tensor %2479, %int3_2403, %int64_2404, %int9223372036854775807_2405, %int1_2406 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2484, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%2485 = torch.aten.neg %2484 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2485, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%2486 = torch.prim.ListConstruct %2485, %2483 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_2407 = torch.constant.int -1 | |
%2487 = torch.aten.cat %2486, %int-1_2407 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2487, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%2488 = torch.aten.mul.Tensor %2487, %2481 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2488, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2408 = torch.constant.int 1 | |
%2489 = torch.aten.add.Tensor %2482, %2488, %int1_2408 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2489, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2409 = torch.constant.int 1 | |
%int2_2410 = torch.constant.int 2 | |
%2490 = torch.aten.transpose.int %2489, %int1_2409, %int2_2410 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2490, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
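    // Annotation: apply rotary position embedding to Q using the rotate-half formulation — q * cos + rotate_half(q) * sin, where rotate_half negates the upper 64 lanes and concatenates them ahead of the lower 64.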
%int131072_2411 = torch.constant.int 131072 | |
%none_2412 = torch.constant.none | |
%none_2413 = torch.constant.none | |
%cpu_2414 = torch.constant.device "cpu" | |
%false_2415 = torch.constant.bool false | |
%2491 = torch.aten.arange %int131072_2411, %none_2412, %none_2413, %cpu_2414, %false_2415 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2416 = torch.constant.int 0 | |
%int128_2417 = torch.constant.int 128 | |
%int2_2418 = torch.constant.int 2 | |
%int4_2419 = torch.constant.int 4 | |
%none_2420 = torch.constant.none | |
%cpu_2421 = torch.constant.device "cpu" | |
%false_2422 = torch.constant.bool false | |
%2492 = torch.aten.arange.start_step %int0_2416, %int128_2417, %int2_2418, %int4_2419, %none_2420, %cpu_2421, %false_2422 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2423 = torch.constant.int 6 | |
%2493 = torch.prims.convert_element_type %2492, %int6_2423 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2424 = torch.constant.int 128 | |
%2494 = torch.aten.div.Scalar %2493, %int128_2424 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2425 = torch.constant.float 5.000000e+05 | |
%2495 = torch.aten.pow.Scalar %float5.000000e05_2425, %2494 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2496 = torch.aten.reciprocal %2495 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2426 = torch.constant.float 1.000000e+00 | |
%2497 = torch.aten.mul.Scalar %2496, %float1.000000e00_2426 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2498 = torch.aten.reciprocal %2497 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2427 = torch.constant.float 6.2831853071795862 | |
%2499 = torch.aten.mul.Scalar %2498, %float6.283190e00_2427 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2428 = torch.constant.float 8.192000e+03 | |
%2500 = torch.aten.gt.Scalar %2499, %float8.192000e03_2428 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2429 = torch.constant.int 8 | |
%2501 = torch.aten.div.Scalar %2497, %int8_2429 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2502 = torch.aten.where.self %2500, %2501, %2497 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2503 = torch.aten.reciprocal %2499 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2430 = torch.constant.int 8192 | |
%2504 = torch.aten.mul.Scalar %2503, %int8192_2430 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2431 = torch.constant.int 1 | |
%int1_2432 = torch.constant.int 1 | |
%2505 = torch.aten.sub.Scalar %2504, %int1_2431, %int1_2432 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2433 = torch.constant.int 3 | |
%2506 = torch.aten.div.Scalar %2505, %int3_2433 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2434 = torch.constant.int 1 | |
%int1_2435 = torch.constant.int 1 | |
%2507 = torch.aten.rsub.Scalar %2506, %int1_2434, %int1_2435 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2508 = torch.aten.mul.Tensor %2507, %2502 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2436 = torch.constant.int 8 | |
%2509 = torch.aten.div.Scalar %2508, %int8_2436 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2510 = torch.aten.mul.Tensor %2506, %2502 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2437 = torch.constant.int 1 | |
%2511 = torch.aten.add.Tensor %2509, %2510, %int1_2437 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2438 = torch.constant.float 2.048000e+03 | |
%2512 = torch.aten.lt.Scalar %2499, %float2.048000e03_2438 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2513 = torch.aten.bitwise_not %2512 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2439 = torch.constant.float 8.192000e+03 | |
%2514 = torch.aten.gt.Scalar %2499, %float8.192000e03_2439 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2515 = torch.aten.bitwise_not %2514 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2516 = torch.aten.mul.Tensor %2513, %2515 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2517 = torch.aten.where.self %2516, %2511, %2502 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2518 = torch.prim.ListConstruct %2517, %2517 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2440 = torch.constant.int -1 | |
%2519 = torch.aten.cat %2518, %int-1_2440 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2441 = torch.constant.int 6 | |
%2520 = torch.prims.convert_element_type %2491, %int6_2441 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_2442 = torch.constant.int 131072 | |
%int1_2443 = torch.constant.int 1 | |
%2521 = torch.prim.ListConstruct %int131072_2442, %int1_2443 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2522 = torch.aten.view %2520, %2521 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%2523 = torch.aten.mul.Tensor %2522, %2519 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2524 = torch.aten.cos %2523 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2444 = torch.constant.int 15 | |
%2525 = torch.prims.convert_element_type %2524, %int15_2444 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2526 = torch.aten.sin %2523 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2445 = torch.constant.int 15 | |
%2527 = torch.prims.convert_element_type %2526, %int15_2445 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
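    // Annotation: the same cos/sin table construction is repeated for the K path; the slices below select the rows for the current sequence length before applying the rotation to K.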
%int1_2446 = torch.constant.int 1 | |
%2528 = torch.aten.size.int %2405, %int1_2446 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_2447 = torch.constant.int 0 | |
%2529 = torch.aten.add.int %int0_2447, %2528 : !torch.int, !torch.int -> !torch.int | |
%int0_2448 = torch.constant.int 0 | |
%int0_2449 = torch.constant.int 0 | |
%int1_2450 = torch.constant.int 1 | |
%2530 = torch.aten.slice.Tensor %2525, %int0_2448, %int0_2449, %2529, %int1_2450 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2530, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2451 = torch.constant.int 1 | |
%int0_2452 = torch.constant.int 0 | |
%int9223372036854775807_2453 = torch.constant.int 9223372036854775807 | |
%int1_2454 = torch.constant.int 1 | |
%2531 = torch.aten.slice.Tensor %2530, %int1_2451, %int0_2452, %int9223372036854775807_2453, %int1_2454 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2531, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2455 = torch.constant.int 0 | |
%2532 = torch.aten.add.int %int0_2455, %2528 : !torch.int, !torch.int -> !torch.int | |
%int0_2456 = torch.constant.int 0 | |
%int0_2457 = torch.constant.int 0 | |
%int1_2458 = torch.constant.int 1 | |
%2533 = torch.aten.slice.Tensor %2527, %int0_2456, %int0_2457, %2532, %int1_2458 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2533, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2459 = torch.constant.int 1 | |
%int0_2460 = torch.constant.int 0 | |
%int9223372036854775807_2461 = torch.constant.int 9223372036854775807 | |
%int1_2462 = torch.constant.int 1 | |
%2534 = torch.aten.slice.Tensor %2533, %int1_2459, %int0_2460, %int9223372036854775807_2461, %int1_2462 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2534, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
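// Reshape cos/sin to (1, seq, 1, 128) and transpose to (1, 1, seq, 128) so they broadcast across the 8 KV heads.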
%int0_2463 = torch.constant.int 0 | |
%2535 = torch.aten.unsqueeze %2531, %int0_2463 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2535, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2464 = torch.constant.int 1 | |
%int0_2465 = torch.constant.int 0 | |
%int9223372036854775807_2466 = torch.constant.int 9223372036854775807 | |
%int1_2467 = torch.constant.int 1 | |
%2536 = torch.aten.slice.Tensor %2535, %int1_2464, %int0_2465, %int9223372036854775807_2466, %int1_2467 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2536, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2468 = torch.constant.int 2 | |
%2537 = torch.aten.unsqueeze %2536, %int2_2468 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2537, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2469 = torch.constant.int 3 | |
%int0_2470 = torch.constant.int 0 | |
%int9223372036854775807_2471 = torch.constant.int 9223372036854775807 | |
%int1_2472 = torch.constant.int 1 | |
%2538 = torch.aten.slice.Tensor %2537, %int3_2469, %int0_2470, %int9223372036854775807_2471, %int1_2472 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2538, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_2473 = torch.constant.int 0 | |
%2539 = torch.aten.unsqueeze %2534, %int0_2473 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2539, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2474 = torch.constant.int 1 | |
%int0_2475 = torch.constant.int 0 | |
%int9223372036854775807_2476 = torch.constant.int 9223372036854775807 | |
%int1_2477 = torch.constant.int 1 | |
%2540 = torch.aten.slice.Tensor %2539, %int1_2474, %int0_2475, %int9223372036854775807_2476, %int1_2477 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2540, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2478 = torch.constant.int 2 | |
%2541 = torch.aten.unsqueeze %2540, %int2_2478 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2541, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2479 = torch.constant.int 3 | |
%int0_2480 = torch.constant.int 0 | |
%int9223372036854775807_2481 = torch.constant.int 9223372036854775807 | |
%int1_2482 = torch.constant.int 1 | |
%2542 = torch.aten.slice.Tensor %2541, %int3_2479, %int0_2480, %int9223372036854775807_2481, %int1_2482 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2542, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_2483 = torch.constant.int 1 | |
%int2_2484 = torch.constant.int 2 | |
%2543 = torch.aten.transpose.int %2538, %int1_2483, %int2_2484 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2543, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2485 = torch.constant.int 1 | |
%int1_2486 = torch.constant.int 1 | |
%int1_2487 = torch.constant.int 1 | |
%int1_2488 = torch.constant.int 1 | |
%2544 = torch.prim.ListConstruct %int1_2485, %int1_2486, %int1_2487, %int1_2488 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2545 = torch.aten.repeat %2543, %2544 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2545, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2489 = torch.constant.int 1 | |
%int2_2490 = torch.constant.int 2 | |
%2546 = torch.aten.transpose.int %2542, %int1_2489, %int2_2490 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2546, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2491 = torch.constant.int 1 | |
%int2_2492 = torch.constant.int 2 | |
%2547 = torch.aten.transpose.int %2420, %int1_2491, %int2_2492 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2547, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2493 = torch.constant.int 1 | |
%int1_2494 = torch.constant.int 1 | |
%int1_2495 = torch.constant.int 1 | |
%int1_2496 = torch.constant.int 1 | |
%2548 = torch.prim.ListConstruct %int1_2493, %int1_2494, %int1_2495, %int1_2496 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2549 = torch.aten.repeat %2546, %2548 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2549, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
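// Apply rotary position embedding to the key states: K * cos + rotate_half(K) * sin.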
%2550 = torch.aten.mul.Tensor %2547, %2545 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2550, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_2497 = torch.constant.int 3 | |
%int0_2498 = torch.constant.int 0 | |
%int64_2499 = torch.constant.int 64 | |
%int1_2500 = torch.constant.int 1 | |
%2551 = torch.aten.slice.Tensor %2547, %int3_2497, %int0_2498, %int64_2499, %int1_2500 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2551, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_2501 = torch.constant.int 3 | |
%int64_2502 = torch.constant.int 64 | |
%int9223372036854775807_2503 = torch.constant.int 9223372036854775807 | |
%int1_2504 = torch.constant.int 1 | |
%2552 = torch.aten.slice.Tensor %2547, %int3_2501, %int64_2502, %int9223372036854775807_2503, %int1_2504 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2552, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%2553 = torch.aten.neg %2552 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2553, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%2554 = torch.prim.ListConstruct %2553, %2551 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_2505 = torch.constant.int -1 | |
%2555 = torch.aten.cat %2554, %int-1_2505 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2555, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%2556 = torch.aten.mul.Tensor %2555, %2549 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2556, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2506 = torch.constant.int 1 | |
%2557 = torch.aten.add.Tensor %2550, %2556, %int1_2506 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2557, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2507 = torch.constant.int 1 | |
%int2_2508 = torch.constant.int 2 | |
%2558 = torch.aten.transpose.int %2557, %int1_2507, %int2_2508 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2558, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
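// Quantize the rotated K and the V states for the KV cache: divide by the cache scale, clamp to the f8E4M3FNUZ range (+/-240), and cast.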
%2559 = torch.aten.div.Tensor %2558, %110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2559, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_2509 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2510 = torch.constant.float 2.400000e+02 | |
%2560 = torch.aten.clamp %2559, %float-2.400000e02_2509, %float2.400000e02_2510 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2560, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_2511 = torch.constant.int 26 | |
%2561 = torch.prims.convert_element_type %2560, %int26_2511 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2561, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%2562 = torch.aten.div.Tensor %2422, %110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2562, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_2512 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2513 = torch.constant.float 2.400000e+02 | |
%2563 = torch.aten.clamp %2562, %float-2.400000e02_2512, %float2.400000e02_2513 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2563, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_2514 = torch.constant.int 26 | |
%2564 = torch.prims.convert_element_type %2563, %int26_2514 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2564, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
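// Compute KV-cache slot indices (page index * 64 plus what appears to be this block's K slot offset) and scatter the quantized K states into the paged cache with index_put.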
%int64_2515 = torch.constant.int 64 | |
%2565 = torch.aten.mul.Scalar %arg2, %int64_2515 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2565, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int12 = torch.constant.int 12 | |
%int1_2516 = torch.constant.int 1 | |
%2566 = torch.aten.add.Scalar %2565, %int12, %int1_2516 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2566, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int1_2517 = torch.constant.int 1 | |
%int32_2518 = torch.constant.int 32 | |
%int8_2519 = torch.constant.int 8 | |
%int128_2520 = torch.constant.int 128 | |
%2567 = torch.prim.ListConstruct %int1_2517, %748, %int32_2518, %int8_2519, %int128_2520 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2568 = torch.aten.view %2561, %2567 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2568, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_2521 = torch.constant.int 32 | |
%int8_2522 = torch.constant.int 8 | |
%int128_2523 = torch.constant.int 128 | |
%2569 = torch.prim.ListConstruct %748, %int32_2521, %int8_2522, %int128_2523 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2570 = torch.aten.view %2568, %2569 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2570, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2571 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%2572 = torch.aten.view %2566, %2571 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2572, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%int32_2524 = torch.constant.int 32 | |
%int2_2525 = torch.constant.int 2 | |
%int32_2526 = torch.constant.int 32 | |
%int8_2527 = torch.constant.int 8 | |
%int128_2528 = torch.constant.int 128 | |
%2573 = torch.prim.ListConstruct %739, %int32_2524, %int2_2525, %int32_2526, %int8_2527, %int128_2528 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2574 = torch.aten.view %2297, %2573 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2574, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2529 = torch.constant.int 32 | |
%2575 = torch.aten.mul.int %739, %int32_2529 : !torch.int, !torch.int -> !torch.int | |
%int2_2530 = torch.constant.int 2 | |
%2576 = torch.aten.mul.int %2575, %int2_2530 : !torch.int, !torch.int -> !torch.int | |
%int32_2531 = torch.constant.int 32 | |
%int8_2532 = torch.constant.int 8 | |
%int128_2533 = torch.constant.int 128 | |
%2577 = torch.prim.ListConstruct %2576, %int32_2531, %int8_2532, %int128_2533 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2578 = torch.aten.view %2574, %2577 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2578, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2579 = torch.prim.ListConstruct %2572 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2534 = torch.constant.bool false | |
%2580 = torch.aten.index_put %2578, %2579, %2570, %false_2534 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2580, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2535 = torch.constant.int 32 | |
%int2_2536 = torch.constant.int 2 | |
%int32_2537 = torch.constant.int 32 | |
%int8_2538 = torch.constant.int 8 | |
%int128_2539 = torch.constant.int 128 | |
%2581 = torch.prim.ListConstruct %739, %int32_2535, %int2_2536, %int32_2537, %int8_2538, %int128_2539 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2582 = torch.aten.view %2580, %2581 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2582, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2540 = torch.constant.int 2097152 | |
%2583 = torch.prim.ListConstruct %739, %int2097152_2540 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2584 = torch.aten.view %2582, %2583 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2584, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%int32_2541 = torch.constant.int 32 | |
%int2_2542 = torch.constant.int 2 | |
%int32_2543 = torch.constant.int 32 | |
%int8_2544 = torch.constant.int 8 | |
%int128_2545 = torch.constant.int 128 | |
%2585 = torch.prim.ListConstruct %739, %int32_2541, %int2_2542, %int32_2543, %int8_2544, %int128_2545 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2586 = torch.aten.view %2584, %2585 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2586, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2546 = torch.constant.int 32 | |
%int8_2547 = torch.constant.int 8 | |
%int128_2548 = torch.constant.int 128 | |
%2587 = torch.prim.ListConstruct %2576, %int32_2546, %int8_2547, %int128_2548 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2588 = torch.aten.view %2586, %2587 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2588, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
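// Scatter the quantized V states into the adjacent slot (K index + 1), then flatten the cache back to its (pages, 2097152) layout.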
%int1_2549 = torch.constant.int 1 | |
%int32_2550 = torch.constant.int 32 | |
%int8_2551 = torch.constant.int 8 | |
%int128_2552 = torch.constant.int 128 | |
%2589 = torch.prim.ListConstruct %int1_2549, %748, %int32_2550, %int8_2551, %int128_2552 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2590 = torch.aten.view %2564, %2589 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2590, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_2553 = torch.constant.int 32 | |
%int8_2554 = torch.constant.int 8 | |
%int128_2555 = torch.constant.int 128 | |
%2591 = torch.prim.ListConstruct %748, %int32_2553, %int8_2554, %int128_2555 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2592 = torch.aten.view %2590, %2591 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2592, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2556 = torch.constant.int 1 | |
%int1_2557 = torch.constant.int 1 | |
%2593 = torch.aten.add.Scalar %2566, %int1_2556, %int1_2557 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2593, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%2594 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%2595 = torch.aten.view %2593, %2594 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2595, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%2596 = torch.prim.ListConstruct %2595 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2558 = torch.constant.bool false | |
%2597 = torch.aten.index_put %2588, %2596, %2592, %false_2558 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2597, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2559 = torch.constant.int 32 | |
%int2_2560 = torch.constant.int 2 | |
%int32_2561 = torch.constant.int 32 | |
%int8_2562 = torch.constant.int 8 | |
%int128_2563 = torch.constant.int 128 | |
%2598 = torch.prim.ListConstruct %739, %int32_2559, %int2_2560, %int32_2561, %int8_2562, %int128_2563 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2599 = torch.aten.view %2597, %2598 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2599, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2564 = torch.constant.int 2097152 | |
%2600 = torch.prim.ListConstruct %739, %int2097152_2564 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2601 = torch.aten.view %2599, %2600 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2601, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
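// Grouped-query attention: expand the 8 KV heads to 32 by repeating each head 4 times, then dequantize back to bf16 by multiplying with the cache scale.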
%int-2_2565 = torch.constant.int -2 | |
%2602 = torch.aten.unsqueeze %2561, %int-2_2565 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2602, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_2566 = torch.constant.int 1 | |
%int8_2567 = torch.constant.int 8 | |
%int4_2568 = torch.constant.int 4 | |
%int128_2569 = torch.constant.int 128 | |
%2603 = torch.prim.ListConstruct %int1_2566, %2528, %int8_2567, %int4_2568, %int128_2569 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2570 = torch.constant.bool false | |
%2604 = torch.aten.expand %2602, %2603, %false_2570 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2604, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_2571 = torch.constant.int 0 | |
%2605 = torch.aten.clone %2604, %int0_2571 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2605, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_2572 = torch.constant.int 1 | |
%int32_2573 = torch.constant.int 32 | |
%int128_2574 = torch.constant.int 128 | |
%2606 = torch.prim.ListConstruct %int1_2572, %2528, %int32_2573, %int128_2574 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2607 = torch.aten._unsafe_view %2605, %2606 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2607, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_2575 = torch.constant.int -2 | |
%2608 = torch.aten.unsqueeze %2564, %int-2_2575 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2608, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_2576 = torch.constant.int 1 | |
%2609 = torch.aten.size.int %2415, %int1_2576 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_2577 = torch.constant.int 1 | |
%int8_2578 = torch.constant.int 8 | |
%int4_2579 = torch.constant.int 4 | |
%int128_2580 = torch.constant.int 128 | |
%2610 = torch.prim.ListConstruct %int1_2577, %2609, %int8_2578, %int4_2579, %int128_2580 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2581 = torch.constant.bool false | |
%2611 = torch.aten.expand %2608, %2610, %false_2581 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2611, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_2582 = torch.constant.int 0 | |
%2612 = torch.aten.clone %2611, %int0_2582 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2612, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_2583 = torch.constant.int 1 | |
%int32_2584 = torch.constant.int 32 | |
%int128_2585 = torch.constant.int 128 | |
%2613 = torch.prim.ListConstruct %int1_2583, %2609, %int32_2584, %int128_2585 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2614 = torch.aten._unsafe_view %2612, %2613 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2614, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int6_2586 = torch.constant.int 6 | |
%2615 = torch.prims.convert_element_type %2607, %int6_2586 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2615, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2616 = torch.aten.mul.Tensor %2615, %110 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2616, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_2587 = torch.constant.int 15 | |
%2617 = torch.prims.convert_element_type %2616, %int15_2587 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2617, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_2588 = torch.constant.int 6 | |
%2618 = torch.prims.convert_element_type %2614, %int6_2588 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2618, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2619 = torch.aten.mul.Tensor %2618, %110 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2619, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_2589 = torch.constant.int 15 | |
%2620 = torch.prims.convert_element_type %2619, %int15_2589 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2620, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
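// Transpose Q, K, V to (1, heads, seq, 128) and run causal scaled-dot-product flash attention (dropout 0, is_causal = true).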
%int1_2590 = torch.constant.int 1 | |
%int2_2591 = torch.constant.int 2 | |
%2621 = torch.aten.transpose.int %2490, %int1_2590, %int2_2591 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2621, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2592 = torch.constant.int 1 | |
%int2_2593 = torch.constant.int 2 | |
%2622 = torch.aten.transpose.int %2617, %int1_2592, %int2_2593 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2622, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2594 = torch.constant.int 1 | |
%int2_2595 = torch.constant.int 2 | |
%2623 = torch.aten.transpose.int %2620, %int1_2594, %int2_2595 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2623, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00_2596 = torch.constant.float 0.000000e+00 | |
%true_2597 = torch.constant.bool true | |
%none_2598 = torch.constant.none | |
%none_2599 = torch.constant.none | |
%2624:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2621, %2622, %2623, %float0.000000e00_2596, %true_2597, %none_2598, %none_2599) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %2624#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
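// Transpose the attention output back, flatten the heads to 4096, quantize to f8, apply the attention output projection, dequantize, and add the residual.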
%int1_2600 = torch.constant.int 1 | |
%int2_2601 = torch.constant.int 2 | |
%2625 = torch.aten.transpose.int %2624#0, %int1_2600, %int2_2601 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2625, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_2602 = torch.constant.int 1 | |
%int4096_2603 = torch.constant.int 4096 | |
%2626 = torch.prim.ListConstruct %int1_2602, %2460, %int4096_2603 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2627 = torch.aten.view %2625, %2626 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2627, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2628 = torch.aten.div.Tensor %2627, %111 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2628, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2604 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2605 = torch.constant.float 2.400000e+02 | |
%2629 = torch.aten.clamp %2628, %float-2.400000e02_2604, %float2.400000e02_2605 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2629, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2606 = torch.constant.int 26 | |
%2630 = torch.prims.convert_element_type %2629, %int26_2606 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2630, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2607 = torch.constant.int -2 | |
%int-1_2608 = torch.constant.int -1 | |
%2631 = torch.aten.transpose.int %112, %int-2_2607, %int-1_2608 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_2609 = torch.constant.int 4096 | |
%2632 = torch.prim.ListConstruct %2460, %int4096_2609 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2633 = torch.aten.view %2630, %2632 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2633, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2634 = torch.aten.mm %2633, %2631 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2634, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2610 = torch.constant.int 1 | |
%int4096_2611 = torch.constant.int 4096 | |
%2635 = torch.prim.ListConstruct %int1_2610, %2460, %int4096_2611 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2636 = torch.aten.view %2634, %2635 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2636, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2612 = torch.constant.int 15 | |
%2637 = torch.prims.convert_element_type %2636, %int15_2612 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2637, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_2613 = torch.constant.int 1 | |
%2638 = torch.aten.add.Tensor %2377, %2637, %int1_2613 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2638, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
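// RMSNorm of the residual stream (mean of squares + 1e-5, rsqrt), scale by the ffn_norm weight, then quantize the result to f8 for the FFN matmuls.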
%int6_2614 = torch.constant.int 6 | |
%2639 = torch.prims.convert_element_type %2638, %int6_2614 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2639, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_2615 = torch.constant.int 2 | |
%2640 = torch.aten.pow.Tensor_Scalar %2639, %int2_2615 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2640, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_2616 = torch.constant.int -1 | |
%2641 = torch.prim.ListConstruct %int-1_2616 : (!torch.int) -> !torch.list<int> | |
%true_2617 = torch.constant.bool true | |
%none_2618 = torch.constant.none | |
%2642 = torch.aten.mean.dim %2640, %2641, %true_2617, %none_2618 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2642, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_2619 = torch.constant.float 1.000000e-05 | |
%int1_2620 = torch.constant.int 1 | |
%2643 = torch.aten.add.Scalar %2642, %float1.000000e-05_2619, %int1_2620 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2643, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2644 = torch.aten.rsqrt %2643 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2644, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2645 = torch.aten.mul.Tensor %2639, %2644 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2645, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_2621 = torch.constant.int 15 | |
%2646 = torch.prims.convert_element_type %2645, %int15_2621 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2646, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2647 = torch.aten.mul.Tensor %113, %2646 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2647, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2648 = torch.aten.div.Tensor %2647, %114 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2648, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2622 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2623 = torch.constant.float 2.400000e+02 | |
%2649 = torch.aten.clamp %2648, %float-2.400000e02_2622, %float2.400000e02_2623 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2649, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2624 = torch.constant.int 26 | |
%2650 = torch.prims.convert_element_type %2649, %int26_2624 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2650, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
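// FFN gate projection (4096 -> 14336) as an f8 matmul, dequantized to bf16 and passed through SiLU.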
%int-2_2625 = torch.constant.int -2 | |
%int-1_2626 = torch.constant.int -1 | |
%2651 = torch.aten.transpose.int %115, %int-2_2625, %int-1_2626 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_2627 = torch.constant.int 4096 | |
%2652 = torch.prim.ListConstruct %564, %int4096_2627 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2653 = torch.aten.view %2650, %2652 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2653, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2654 = torch.aten.mm %2653, %2651 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2654, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_2628 = torch.constant.int 1 | |
%int14336_2629 = torch.constant.int 14336 | |
%2655 = torch.prim.ListConstruct %int1_2628, %564, %int14336_2629 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2656 = torch.aten.view %2654, %2655 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2656, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_2630 = torch.constant.int 15 | |
%2657 = torch.prims.convert_element_type %2656, %int15_2630 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2657, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2658 = torch.aten.silu %2657 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2658, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
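// FFN up projection (4096 -> 14336), then elementwise multiply with the gated activations (SwiGLU).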
%2659 = torch.aten.div.Tensor %2647, %116 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2659, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2631 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2632 = torch.constant.float 2.400000e+02 | |
%2660 = torch.aten.clamp %2659, %float-2.400000e02_2631, %float2.400000e02_2632 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2660, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2633 = torch.constant.int 26 | |
%2661 = torch.prims.convert_element_type %2660, %int26_2633 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2661, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2634 = torch.constant.int -2 | |
%int-1_2635 = torch.constant.int -1 | |
%2662 = torch.aten.transpose.int %117, %int-2_2634, %int-1_2635 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_2636 = torch.constant.int 4096 | |
%2663 = torch.prim.ListConstruct %564, %int4096_2636 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2664 = torch.aten.view %2661, %2663 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2664, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2665 = torch.aten.mm %2664, %2662 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2665, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_2637 = torch.constant.int 1 | |
%int14336_2638 = torch.constant.int 14336 | |
%2666 = torch.prim.ListConstruct %int1_2637, %564, %int14336_2638 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2667 = torch.aten.view %2665, %2666 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2667, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_2639 = torch.constant.int 15 | |
%2668 = torch.prims.convert_element_type %2667, %int15_2639 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2668, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2669 = torch.aten.mul.Tensor %2658, %2668 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2669, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
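// Quantize the gated product, apply the FFN down projection (14336 -> 4096), dequantize, and add the residual to close out this transformer block.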
%2670 = torch.aten.div.Tensor %2669, %118 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2670, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_2640 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2641 = torch.constant.float 2.400000e+02 | |
%2671 = torch.aten.clamp %2670, %float-2.400000e02_2640, %float2.400000e02_2641 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2671, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_2642 = torch.constant.int 26 | |
%2672 = torch.prims.convert_element_type %2671, %int26_2642 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2672, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_2643 = torch.constant.int -2 | |
%int-1_2644 = torch.constant.int -1 | |
%2673 = torch.aten.transpose.int %119, %int-2_2643, %int-1_2644 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_2645 = torch.constant.int 1 | |
%2674 = torch.aten.size.int %2656, %int1_2645 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_2646 = torch.constant.int 14336 | |
%2675 = torch.prim.ListConstruct %2674, %int14336_2646 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2676 = torch.aten.view %2672, %2675 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2676, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%2677 = torch.aten.mm %2676, %2673 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2677, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2647 = torch.constant.int 1 | |
%int4096_2648 = torch.constant.int 4096 | |
%2678 = torch.prim.ListConstruct %int1_2647, %2674, %int4096_2648 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2679 = torch.aten.view %2677, %2678 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2679, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2649 = torch.constant.int 15 | |
%2680 = torch.prims.convert_element_type %2679, %int15_2649 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2680, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int1_2650 = torch.constant.int 1 | |
%2681 = torch.aten.add.Tensor %2638, %2680, %int1_2650 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2681, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
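// Start of the next block: RMSNorm with its attn_norm weight, then quantize to f8 for the attention projections.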
%int6_2651 = torch.constant.int 6 | |
%2682 = torch.prims.convert_element_type %2681, %int6_2651 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2682, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_2652 = torch.constant.int 2 | |
%2683 = torch.aten.pow.Tensor_Scalar %2682, %int2_2652 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2683, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_2653 = torch.constant.int -1 | |
%2684 = torch.prim.ListConstruct %int-1_2653 : (!torch.int) -> !torch.list<int> | |
%true_2654 = torch.constant.bool true | |
%none_2655 = torch.constant.none | |
%2685 = torch.aten.mean.dim %2683, %2684, %true_2654, %none_2655 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2685, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_2656 = torch.constant.float 1.000000e-05 | |
%int1_2657 = torch.constant.int 1 | |
%2686 = torch.aten.add.Scalar %2685, %float1.000000e-05_2656, %int1_2657 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2686, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2687 = torch.aten.rsqrt %2686 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2687, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2688 = torch.aten.mul.Tensor %2682, %2687 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2688, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_2658 = torch.constant.int 15 | |
%2689 = torch.prims.convert_element_type %2688, %int15_2658 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2689, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2690 = torch.aten.mul.Tensor %120, %2689 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2690, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2691 = torch.aten.div.Tensor %2690, %121 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2691, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2659 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2660 = torch.constant.float 2.400000e+02 | |
%2692 = torch.aten.clamp %2691, %float-2.400000e02_2659, %float2.400000e02_2660 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2692, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2661 = torch.constant.int 26 | |
%2693 = torch.prims.convert_element_type %2692, %int26_2661 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2693, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
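// Q, K, V projections for the next block (f8 matmuls: 4096 -> 4096 for Q, 4096 -> 1024 for K and V), dequantized to bf16 and reshaped to 32 query heads / 8 KV heads of size 128.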
%int-2_2662 = torch.constant.int -2 | |
%int-1_2663 = torch.constant.int -1 | |
%2694 = torch.aten.transpose.int %122, %int-2_2662, %int-1_2663 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_2664 = torch.constant.int 4096 | |
%2695 = torch.prim.ListConstruct %564, %int4096_2664 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2696 = torch.aten.view %2693, %2695 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2696, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2697 = torch.aten.mm %2696, %2694 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2697, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2665 = torch.constant.int 1 | |
%int4096_2666 = torch.constant.int 4096 | |
%2698 = torch.prim.ListConstruct %int1_2665, %564, %int4096_2666 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2699 = torch.aten.view %2697, %2698 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2699, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2667 = torch.constant.int 15 | |
%2700 = torch.prims.convert_element_type %2699, %int15_2667 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2700, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2701 = torch.aten.div.Tensor %2690, %123 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2701, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2668 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2669 = torch.constant.float 2.400000e+02 | |
%2702 = torch.aten.clamp %2701, %float-2.400000e02_2668, %float2.400000e02_2669 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2702, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2670 = torch.constant.int 26 | |
%2703 = torch.prims.convert_element_type %2702, %int26_2670 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2703, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2671 = torch.constant.int -2 | |
%int-1_2672 = torch.constant.int -1 | |
%2704 = torch.aten.transpose.int %124, %int-2_2671, %int-1_2672 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_2673 = torch.constant.int 4096 | |
%2705 = torch.prim.ListConstruct %564, %int4096_2673 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2706 = torch.aten.view %2703, %2705 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2706, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2707 = torch.aten.mm %2706, %2704 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2707, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_2674 = torch.constant.int 1 | |
%int1024_2675 = torch.constant.int 1024 | |
%2708 = torch.prim.ListConstruct %int1_2674, %564, %int1024_2675 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2709 = torch.aten.view %2707, %2708 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2709, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_2676 = torch.constant.int 15 | |
%2710 = torch.prims.convert_element_type %2709, %int15_2676 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %2710, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%2711 = torch.aten.div.Tensor %2690, %125 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2711, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2677 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2678 = torch.constant.float 2.400000e+02 | |
%2712 = torch.aten.clamp %2711, %float-2.400000e02_2677, %float2.400000e02_2678 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2712, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2679 = torch.constant.int 26 | |
%2713 = torch.prims.convert_element_type %2712, %int26_2679 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2713, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_2680 = torch.constant.int -2 | |
%int-1_2681 = torch.constant.int -1 | |
%2714 = torch.aten.transpose.int %126, %int-2_2680, %int-1_2681 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_2682 = torch.constant.int 4096 | |
%2715 = torch.prim.ListConstruct %564, %int4096_2682 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2716 = torch.aten.view %2713, %2715 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2716, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2717 = torch.aten.mm %2716, %2714 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2717, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_2683 = torch.constant.int 1 | |
%int1024_2684 = torch.constant.int 1024 | |
%2718 = torch.prim.ListConstruct %int1_2683, %564, %int1024_2684 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2719 = torch.aten.view %2717, %2718 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2719, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_2685 = torch.constant.int 15 | |
%2720 = torch.prims.convert_element_type %2719, %int15_2685 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %2720, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
%int1_2686 = torch.constant.int 1 | |
%int32_2687 = torch.constant.int 32 | |
%int128_2688 = torch.constant.int 128 | |
%2721 = torch.prim.ListConstruct %int1_2686, %564, %int32_2687, %int128_2688 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2722 = torch.aten.view %2700, %2721 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2722, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_2689 = torch.constant.int 1 | |
%int8_2690 = torch.constant.int 8 | |
%int128_2691 = torch.constant.int 128 | |
%2723 = torch.prim.ListConstruct %int1_2689, %564, %int8_2690, %int128_2691 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2724 = torch.aten.view %2710, %2723 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2724, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_2692 = torch.constant.int 1 | |
%int8_2693 = torch.constant.int 8 | |
%int128_2694 = torch.constant.int 128 | |
%2725 = torch.prim.ListConstruct %int1_2692, %564, %int8_2693, %int128_2694 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2726 = torch.aten.view %2720, %2725 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2726, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
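// Build RoPE tables: inverse frequencies over 64 even dims with base 5.0e5, wavelength-dependent scaling (frequencies whose wavelength exceeds 8192 are divided by 8, smoothly blended between wavelengths 2048 and 8192), then cos/sin over 131072 positions, cast to bf16.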
%int131072_2695 = torch.constant.int 131072 | |
%none_2696 = torch.constant.none | |
%none_2697 = torch.constant.none | |
%cpu_2698 = torch.constant.device "cpu" | |
%false_2699 = torch.constant.bool false | |
%2727 = torch.aten.arange %int131072_2695, %none_2696, %none_2697, %cpu_2698, %false_2699 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2700 = torch.constant.int 0 | |
%int128_2701 = torch.constant.int 128 | |
%int2_2702 = torch.constant.int 2 | |
%int4_2703 = torch.constant.int 4 | |
%none_2704 = torch.constant.none | |
%cpu_2705 = torch.constant.device "cpu" | |
%false_2706 = torch.constant.bool false | |
%2728 = torch.aten.arange.start_step %int0_2700, %int128_2701, %int2_2702, %int4_2703, %none_2704, %cpu_2705, %false_2706 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2707 = torch.constant.int 6 | |
%2729 = torch.prims.convert_element_type %2728, %int6_2707 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2708 = torch.constant.int 128 | |
%2730 = torch.aten.div.Scalar %2729, %int128_2708 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2709 = torch.constant.float 5.000000e+05 | |
%2731 = torch.aten.pow.Scalar %float5.000000e05_2709, %2730 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2732 = torch.aten.reciprocal %2731 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2710 = torch.constant.float 1.000000e+00 | |
%2733 = torch.aten.mul.Scalar %2732, %float1.000000e00_2710 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2734 = torch.aten.reciprocal %2733 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2711 = torch.constant.float 6.2831853071795862 | |
%2735 = torch.aten.mul.Scalar %2734, %float6.283190e00_2711 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2712 = torch.constant.float 8.192000e+03 | |
%2736 = torch.aten.gt.Scalar %2735, %float8.192000e03_2712 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2713 = torch.constant.int 8 | |
%2737 = torch.aten.div.Scalar %2733, %int8_2713 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2738 = torch.aten.where.self %2736, %2737, %2733 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2739 = torch.aten.reciprocal %2735 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2714 = torch.constant.int 8192 | |
%2740 = torch.aten.mul.Scalar %2739, %int8192_2714 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2715 = torch.constant.int 1 | |
%int1_2716 = torch.constant.int 1 | |
%2741 = torch.aten.sub.Scalar %2740, %int1_2715, %int1_2716 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2717 = torch.constant.int 3 | |
%2742 = torch.aten.div.Scalar %2741, %int3_2717 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2718 = torch.constant.int 1 | |
%int1_2719 = torch.constant.int 1 | |
%2743 = torch.aten.rsub.Scalar %2742, %int1_2718, %int1_2719 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2744 = torch.aten.mul.Tensor %2743, %2738 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2720 = torch.constant.int 8 | |
%2745 = torch.aten.div.Scalar %2744, %int8_2720 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2746 = torch.aten.mul.Tensor %2742, %2738 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2721 = torch.constant.int 1 | |
%2747 = torch.aten.add.Tensor %2745, %2746, %int1_2721 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2722 = torch.constant.float 2.048000e+03 | |
%2748 = torch.aten.lt.Scalar %2735, %float2.048000e03_2722 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2749 = torch.aten.bitwise_not %2748 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2723 = torch.constant.float 8.192000e+03 | |
%2750 = torch.aten.gt.Scalar %2735, %float8.192000e03_2723 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2751 = torch.aten.bitwise_not %2750 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2752 = torch.aten.mul.Tensor %2749, %2751 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2753 = torch.aten.where.self %2752, %2747, %2738 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2754 = torch.prim.ListConstruct %2753, %2753 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2724 = torch.constant.int -1 | |
%2755 = torch.aten.cat %2754, %int-1_2724 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2725 = torch.constant.int 6 | |
%2756 = torch.prims.convert_element_type %2727, %int6_2725 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_2726 = torch.constant.int 131072 | |
%int1_2727 = torch.constant.int 1 | |
%2757 = torch.prim.ListConstruct %int131072_2726, %int1_2727 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2758 = torch.aten.view %2756, %2757 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%2759 = torch.aten.mul.Tensor %2758, %2755 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2760 = torch.aten.cos %2759 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2728 = torch.constant.int 15 | |
%2761 = torch.prims.convert_element_type %2760, %int15_2728 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2762 = torch.aten.sin %2759 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2729 = torch.constant.int 15 | |
%2763 = torch.prims.convert_element_type %2762, %int15_2729 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
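// Slice the cos/sin tables to the current sequence length and broadcast them to [1, 1, seq, 128] for the query rotation.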
%int1_2730 = torch.constant.int 1 | |
%2764 = torch.aten.size.int %2699, %int1_2730 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_2731 = torch.constant.int 0 | |
%2765 = torch.aten.add.int %int0_2731, %2764 : !torch.int, !torch.int -> !torch.int | |
%int0_2732 = torch.constant.int 0 | |
%int0_2733 = torch.constant.int 0 | |
%int1_2734 = torch.constant.int 1 | |
%2766 = torch.aten.slice.Tensor %2761, %int0_2732, %int0_2733, %2765, %int1_2734 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2766, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2735 = torch.constant.int 1 | |
%int0_2736 = torch.constant.int 0 | |
%int9223372036854775807_2737 = torch.constant.int 9223372036854775807 | |
%int1_2738 = torch.constant.int 1 | |
%2767 = torch.aten.slice.Tensor %2766, %int1_2735, %int0_2736, %int9223372036854775807_2737, %int1_2738 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2767, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2739 = torch.constant.int 0 | |
%2768 = torch.aten.add.int %int0_2739, %2764 : !torch.int, !torch.int -> !torch.int | |
%int0_2740 = torch.constant.int 0 | |
%int0_2741 = torch.constant.int 0 | |
%int1_2742 = torch.constant.int 1 | |
%2769 = torch.aten.slice.Tensor %2763, %int0_2740, %int0_2741, %2768, %int1_2742 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2769, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2743 = torch.constant.int 1 | |
%int0_2744 = torch.constant.int 0 | |
%int9223372036854775807_2745 = torch.constant.int 9223372036854775807 | |
%int1_2746 = torch.constant.int 1 | |
%2770 = torch.aten.slice.Tensor %2769, %int1_2743, %int0_2744, %int9223372036854775807_2745, %int1_2746 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2770, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2747 = torch.constant.int 0 | |
%2771 = torch.aten.unsqueeze %2767, %int0_2747 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2771, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2748 = torch.constant.int 1 | |
%int0_2749 = torch.constant.int 0 | |
%int9223372036854775807_2750 = torch.constant.int 9223372036854775807 | |
%int1_2751 = torch.constant.int 1 | |
%2772 = torch.aten.slice.Tensor %2771, %int1_2748, %int0_2749, %int9223372036854775807_2750, %int1_2751 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2772, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2752 = torch.constant.int 2 | |
%2773 = torch.aten.unsqueeze %2772, %int2_2752 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2773, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2753 = torch.constant.int 3 | |
%int0_2754 = torch.constant.int 0 | |
%int9223372036854775807_2755 = torch.constant.int 9223372036854775807 | |
%int1_2756 = torch.constant.int 1 | |
%2774 = torch.aten.slice.Tensor %2773, %int3_2753, %int0_2754, %int9223372036854775807_2755, %int1_2756 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2774, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_2757 = torch.constant.int 0 | |
%2775 = torch.aten.unsqueeze %2770, %int0_2757 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2775, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2758 = torch.constant.int 1 | |
%int0_2759 = torch.constant.int 0 | |
%int9223372036854775807_2760 = torch.constant.int 9223372036854775807 | |
%int1_2761 = torch.constant.int 1 | |
%2776 = torch.aten.slice.Tensor %2775, %int1_2758, %int0_2759, %int9223372036854775807_2760, %int1_2761 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2776, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2762 = torch.constant.int 2 | |
%2777 = torch.aten.unsqueeze %2776, %int2_2762 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2777, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2763 = torch.constant.int 3 | |
%int0_2764 = torch.constant.int 0 | |
%int9223372036854775807_2765 = torch.constant.int 9223372036854775807 | |
%int1_2766 = torch.constant.int 1 | |
%2778 = torch.aten.slice.Tensor %2777, %int3_2763, %int0_2764, %int9223372036854775807_2765, %int1_2766 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2778, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_2767 = torch.constant.int 1 | |
%int2_2768 = torch.constant.int 2 | |
%2779 = torch.aten.transpose.int %2774, %int1_2767, %int2_2768 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2779, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2769 = torch.constant.int 1 | |
%int1_2770 = torch.constant.int 1 | |
%int1_2771 = torch.constant.int 1 | |
%int1_2772 = torch.constant.int 1 | |
%2780 = torch.prim.ListConstruct %int1_2769, %int1_2770, %int1_2771, %int1_2772 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2781 = torch.aten.repeat %2779, %2780 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2781, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2773 = torch.constant.int 1 | |
%int2_2774 = torch.constant.int 2 | |
%2782 = torch.aten.transpose.int %2778, %int1_2773, %int2_2774 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2782, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
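// Apply RoPE to the queries: q * cos + rotate_half(q) * sin, computed in [1, 32, seq, 128] and transposed back to [1, seq, 32, 128].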
%int1_2775 = torch.constant.int 1 | |
%int2_2776 = torch.constant.int 2 | |
%2783 = torch.aten.transpose.int %2722, %int1_2775, %int2_2776 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2783, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2777 = torch.constant.int 1 | |
%int1_2778 = torch.constant.int 1 | |
%int1_2779 = torch.constant.int 1 | |
%int1_2780 = torch.constant.int 1 | |
%2784 = torch.prim.ListConstruct %int1_2777, %int1_2778, %int1_2779, %int1_2780 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2785 = torch.aten.repeat %2782, %2784 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2785, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%2786 = torch.aten.mul.Tensor %2783, %2781 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2786, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_2781 = torch.constant.int 3 | |
%int0_2782 = torch.constant.int 0 | |
%int64_2783 = torch.constant.int 64 | |
%int1_2784 = torch.constant.int 1 | |
%2787 = torch.aten.slice.Tensor %2783, %int3_2781, %int0_2782, %int64_2783, %int1_2784 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2787, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_2785 = torch.constant.int 3 | |
%int64_2786 = torch.constant.int 64 | |
%int9223372036854775807_2787 = torch.constant.int 9223372036854775807 | |
%int1_2788 = torch.constant.int 1 | |
%2788 = torch.aten.slice.Tensor %2783, %int3_2785, %int64_2786, %int9223372036854775807_2787, %int1_2788 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2788, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%2789 = torch.aten.neg %2788 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %2789, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%2790 = torch.prim.ListConstruct %2789, %2787 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_2789 = torch.constant.int -1 | |
%2791 = torch.aten.cat %2790, %int-1_2789 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2791, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%2792 = torch.aten.mul.Tensor %2791, %2785 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2792, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2790 = torch.constant.int 1 | |
%2793 = torch.aten.add.Tensor %2786, %2792, %int1_2790 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2793, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2791 = torch.constant.int 1 | |
%int2_2792 = torch.constant.int 2 | |
%2794 = torch.aten.transpose.int %2793, %int1_2791, %int2_2792 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2794, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
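// Recompute the same RoPE cos/sin tables for the key path (the exporter emits the table computation once per use).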
%int131072_2793 = torch.constant.int 131072 | |
%none_2794 = torch.constant.none | |
%none_2795 = torch.constant.none | |
%cpu_2796 = torch.constant.device "cpu" | |
%false_2797 = torch.constant.bool false | |
%2795 = torch.aten.arange %int131072_2793, %none_2794, %none_2795, %cpu_2796, %false_2797 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2798 = torch.constant.int 0 | |
%int128_2799 = torch.constant.int 128 | |
%int2_2800 = torch.constant.int 2 | |
%int4_2801 = torch.constant.int 4 | |
%none_2802 = torch.constant.none | |
%cpu_2803 = torch.constant.device "cpu" | |
%false_2804 = torch.constant.bool false | |
%2796 = torch.aten.arange.start_step %int0_2798, %int128_2799, %int2_2800, %int4_2801, %none_2802, %cpu_2803, %false_2804 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2805 = torch.constant.int 6 | |
%2797 = torch.prims.convert_element_type %2796, %int6_2805 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2806 = torch.constant.int 128 | |
%2798 = torch.aten.div.Scalar %2797, %int128_2806 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2807 = torch.constant.float 5.000000e+05 | |
%2799 = torch.aten.pow.Scalar %float5.000000e05_2807, %2798 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2800 = torch.aten.reciprocal %2799 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2808 = torch.constant.float 1.000000e+00 | |
%2801 = torch.aten.mul.Scalar %2800, %float1.000000e00_2808 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2802 = torch.aten.reciprocal %2801 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2809 = torch.constant.float 6.2831853071795862 | |
%2803 = torch.aten.mul.Scalar %2802, %float6.283190e00_2809 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2810 = torch.constant.float 8.192000e+03 | |
%2804 = torch.aten.gt.Scalar %2803, %float8.192000e03_2810 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2811 = torch.constant.int 8 | |
%2805 = torch.aten.div.Scalar %2801, %int8_2811 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2806 = torch.aten.where.self %2804, %2805, %2801 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2807 = torch.aten.reciprocal %2803 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2812 = torch.constant.int 8192 | |
%2808 = torch.aten.mul.Scalar %2807, %int8192_2812 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2813 = torch.constant.int 1 | |
%int1_2814 = torch.constant.int 1 | |
%2809 = torch.aten.sub.Scalar %2808, %int1_2813, %int1_2814 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2815 = torch.constant.int 3 | |
%2810 = torch.aten.div.Scalar %2809, %int3_2815 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2816 = torch.constant.int 1 | |
%int1_2817 = torch.constant.int 1 | |
%2811 = torch.aten.rsub.Scalar %2810, %int1_2816, %int1_2817 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2812 = torch.aten.mul.Tensor %2811, %2806 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2818 = torch.constant.int 8 | |
%2813 = torch.aten.div.Scalar %2812, %int8_2818 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2814 = torch.aten.mul.Tensor %2810, %2806 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2819 = torch.constant.int 1 | |
%2815 = torch.aten.add.Tensor %2813, %2814, %int1_2819 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2820 = torch.constant.float 2.048000e+03 | |
%2816 = torch.aten.lt.Scalar %2803, %float2.048000e03_2820 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2817 = torch.aten.bitwise_not %2816 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2821 = torch.constant.float 8.192000e+03 | |
%2818 = torch.aten.gt.Scalar %2803, %float8.192000e03_2821 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2819 = torch.aten.bitwise_not %2818 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2820 = torch.aten.mul.Tensor %2817, %2819 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2821 = torch.aten.where.self %2820, %2815, %2806 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2822 = torch.prim.ListConstruct %2821, %2821 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2822 = torch.constant.int -1 | |
%2823 = torch.aten.cat %2822, %int-1_2822 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2823 = torch.constant.int 6 | |
%2824 = torch.prims.convert_element_type %2795, %int6_2823 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_2824 = torch.constant.int 131072 | |
%int1_2825 = torch.constant.int 1 | |
%2825 = torch.prim.ListConstruct %int131072_2824, %int1_2825 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2826 = torch.aten.view %2824, %2825 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%2827 = torch.aten.mul.Tensor %2826, %2823 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2828 = torch.aten.cos %2827 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2826 = torch.constant.int 15 | |
%2829 = torch.prims.convert_element_type %2828, %int15_2826 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2830 = torch.aten.sin %2827 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2827 = torch.constant.int 15 | |
%2831 = torch.prims.convert_element_type %2830, %int15_2827 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
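// Slice cos/sin to the key sequence length and apply RoPE to the 8 KV heads: k * cos + rotate_half(k) * sin.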
%int1_2828 = torch.constant.int 1 | |
%2832 = torch.aten.size.int %2709, %int1_2828 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_2829 = torch.constant.int 0 | |
%2833 = torch.aten.add.int %int0_2829, %2832 : !torch.int, !torch.int -> !torch.int | |
%int0_2830 = torch.constant.int 0 | |
%int0_2831 = torch.constant.int 0 | |
%int1_2832 = torch.constant.int 1 | |
%2834 = torch.aten.slice.Tensor %2829, %int0_2830, %int0_2831, %2833, %int1_2832 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2834, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2833 = torch.constant.int 1 | |
%int0_2834 = torch.constant.int 0 | |
%int9223372036854775807_2835 = torch.constant.int 9223372036854775807 | |
%int1_2836 = torch.constant.int 1 | |
%2835 = torch.aten.slice.Tensor %2834, %int1_2833, %int0_2834, %int9223372036854775807_2835, %int1_2836 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2835, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2837 = torch.constant.int 0 | |
%2836 = torch.aten.add.int %int0_2837, %2832 : !torch.int, !torch.int -> !torch.int | |
%int0_2838 = torch.constant.int 0 | |
%int0_2839 = torch.constant.int 0 | |
%int1_2840 = torch.constant.int 1 | |
%2837 = torch.aten.slice.Tensor %2831, %int0_2838, %int0_2839, %2836, %int1_2840 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2837, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2841 = torch.constant.int 1 | |
%int0_2842 = torch.constant.int 0 | |
%int9223372036854775807_2843 = torch.constant.int 9223372036854775807 | |
%int1_2844 = torch.constant.int 1 | |
%2838 = torch.aten.slice.Tensor %2837, %int1_2841, %int0_2842, %int9223372036854775807_2843, %int1_2844 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2838, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2845 = torch.constant.int 0 | |
%2839 = torch.aten.unsqueeze %2835, %int0_2845 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2839, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2846 = torch.constant.int 1 | |
%int0_2847 = torch.constant.int 0 | |
%int9223372036854775807_2848 = torch.constant.int 9223372036854775807 | |
%int1_2849 = torch.constant.int 1 | |
%2840 = torch.aten.slice.Tensor %2839, %int1_2846, %int0_2847, %int9223372036854775807_2848, %int1_2849 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2840, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2850 = torch.constant.int 2 | |
%2841 = torch.aten.unsqueeze %2840, %int2_2850 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2841, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2851 = torch.constant.int 3 | |
%int0_2852 = torch.constant.int 0 | |
%int9223372036854775807_2853 = torch.constant.int 9223372036854775807 | |
%int1_2854 = torch.constant.int 1 | |
%2842 = torch.aten.slice.Tensor %2841, %int3_2851, %int0_2852, %int9223372036854775807_2853, %int1_2854 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2842, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_2855 = torch.constant.int 0 | |
%2843 = torch.aten.unsqueeze %2838, %int0_2855 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2843, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2856 = torch.constant.int 1 | |
%int0_2857 = torch.constant.int 0 | |
%int9223372036854775807_2858 = torch.constant.int 9223372036854775807 | |
%int1_2859 = torch.constant.int 1 | |
%2844 = torch.aten.slice.Tensor %2843, %int1_2856, %int0_2857, %int9223372036854775807_2858, %int1_2859 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2844, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2860 = torch.constant.int 2 | |
%2845 = torch.aten.unsqueeze %2844, %int2_2860 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2845, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2861 = torch.constant.int 3 | |
%int0_2862 = torch.constant.int 0 | |
%int9223372036854775807_2863 = torch.constant.int 9223372036854775807 | |
%int1_2864 = torch.constant.int 1 | |
%2846 = torch.aten.slice.Tensor %2845, %int3_2861, %int0_2862, %int9223372036854775807_2863, %int1_2864 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2846, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int1_2865 = torch.constant.int 1 | |
%int2_2866 = torch.constant.int 2 | |
%2847 = torch.aten.transpose.int %2842, %int1_2865, %int2_2866 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2847, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2867 = torch.constant.int 1 | |
%int1_2868 = torch.constant.int 1 | |
%int1_2869 = torch.constant.int 1 | |
%int1_2870 = torch.constant.int 1 | |
%2848 = torch.prim.ListConstruct %int1_2867, %int1_2868, %int1_2869, %int1_2870 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2849 = torch.aten.repeat %2847, %2848 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2849, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2871 = torch.constant.int 1 | |
%int2_2872 = torch.constant.int 2 | |
%2850 = torch.aten.transpose.int %2846, %int1_2871, %int2_2872 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2850, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_2873 = torch.constant.int 1 | |
%int2_2874 = torch.constant.int 2 | |
%2851 = torch.aten.transpose.int %2724, %int1_2873, %int2_2874 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2851, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2875 = torch.constant.int 1 | |
%int1_2876 = torch.constant.int 1 | |
%int1_2877 = torch.constant.int 1 | |
%int1_2878 = torch.constant.int 1 | |
%2852 = torch.prim.ListConstruct %int1_2875, %int1_2876, %int1_2877, %int1_2878 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2853 = torch.aten.repeat %2850, %2852 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %2853, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%2854 = torch.aten.mul.Tensor %2851, %2849 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2854, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_2879 = torch.constant.int 3 | |
%int0_2880 = torch.constant.int 0 | |
%int64_2881 = torch.constant.int 64 | |
%int1_2882 = torch.constant.int 1 | |
%2855 = torch.aten.slice.Tensor %2851, %int3_2879, %int0_2880, %int64_2881, %int1_2882 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2855, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_2883 = torch.constant.int 3 | |
%int64_2884 = torch.constant.int 64 | |
%int9223372036854775807_2885 = torch.constant.int 9223372036854775807 | |
%int1_2886 = torch.constant.int 1 | |
%2856 = torch.aten.slice.Tensor %2851, %int3_2883, %int64_2884, %int9223372036854775807_2885, %int1_2886 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2856, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%2857 = torch.aten.neg %2856 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %2857, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%2858 = torch.prim.ListConstruct %2857, %2855 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_2887 = torch.constant.int -1 | |
%2859 = torch.aten.cat %2858, %int-1_2887 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2859, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%2860 = torch.aten.mul.Tensor %2859, %2853 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2860, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2888 = torch.constant.int 1 | |
%2861 = torch.aten.add.Tensor %2854, %2860, %int1_2888 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %2861, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_2889 = torch.constant.int 1 | |
%int2_2890 = torch.constant.int 2 | |
%2862 = torch.aten.transpose.int %2861, %int1_2889, %int2_2890 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2862, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
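// Quantize the rotated K and the V to f8E4M3FNUZ for the KV cache: divide by the cache scale, clamp to [-240, 240], convert element type.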
%2863 = torch.aten.div.Tensor %2862, %127 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2863, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_2891 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2892 = torch.constant.float 2.400000e+02 | |
%2864 = torch.aten.clamp %2863, %float-2.400000e02_2891, %float2.400000e02_2892 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2864, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_2893 = torch.constant.int 26 | |
%2865 = torch.prims.convert_element_type %2864, %int26_2893 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2865, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
%2866 = torch.aten.div.Tensor %2726, %127 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2866, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_2894 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2895 = torch.constant.float 2.400000e+02 | |
%2867 = torch.aten.clamp %2866, %float-2.400000e02_2894, %float2.400000e02_2895 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %2867, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_2896 = torch.constant.int 26 | |
%2868 = torch.prims.convert_element_type %2867, %int26_2896 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2868, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
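// Scatter K and V into the paged KV cache via index_put: slot indices are %arg2 * 64 + 14 (K) and + 15 (V) into the cache flattened from [pages, 32, 2, 32, 8, 128] to [pages*64, 32, 8, 128]; offsets 14/15 appear to select this block's K/V slots within each page.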
%int64_2897 = torch.constant.int 64 | |
%2869 = torch.aten.mul.Scalar %arg2, %int64_2897 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2869, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int14 = torch.constant.int 14 | |
%int1_2898 = torch.constant.int 1 | |
%2870 = torch.aten.add.Scalar %2869, %int14, %int1_2898 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2870, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int1_2899 = torch.constant.int 1 | |
%int32_2900 = torch.constant.int 32 | |
%int8_2901 = torch.constant.int 8 | |
%int128_2902 = torch.constant.int 128 | |
%2871 = torch.prim.ListConstruct %int1_2899, %748, %int32_2900, %int8_2901, %int128_2902 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2872 = torch.aten.view %2865, %2871 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2872, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_2903 = torch.constant.int 32 | |
%int8_2904 = torch.constant.int 8 | |
%int128_2905 = torch.constant.int 128 | |
%2873 = torch.prim.ListConstruct %748, %int32_2903, %int8_2904, %int128_2905 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2874 = torch.aten.view %2872, %2873 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2874, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2875 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%2876 = torch.aten.view %2870, %2875 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2876, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%int32_2906 = torch.constant.int 32 | |
%int2_2907 = torch.constant.int 2 | |
%int32_2908 = torch.constant.int 32 | |
%int8_2909 = torch.constant.int 8 | |
%int128_2910 = torch.constant.int 128 | |
%2877 = torch.prim.ListConstruct %739, %int32_2906, %int2_2907, %int32_2908, %int8_2909, %int128_2910 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2878 = torch.aten.view %2601, %2877 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2878, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2911 = torch.constant.int 32 | |
%2879 = torch.aten.mul.int %739, %int32_2911 : !torch.int, !torch.int -> !torch.int | |
%int2_2912 = torch.constant.int 2 | |
%2880 = torch.aten.mul.int %2879, %int2_2912 : !torch.int, !torch.int -> !torch.int | |
%int32_2913 = torch.constant.int 32 | |
%int8_2914 = torch.constant.int 8 | |
%int128_2915 = torch.constant.int 128 | |
%2881 = torch.prim.ListConstruct %2880, %int32_2913, %int8_2914, %int128_2915 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2882 = torch.aten.view %2878, %2881 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2882, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2883 = torch.prim.ListConstruct %2876 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2916 = torch.constant.bool false | |
%2884 = torch.aten.index_put %2882, %2883, %2874, %false_2916 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2884, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2917 = torch.constant.int 32 | |
%int2_2918 = torch.constant.int 2 | |
%int32_2919 = torch.constant.int 32 | |
%int8_2920 = torch.constant.int 8 | |
%int128_2921 = torch.constant.int 128 | |
%2885 = torch.prim.ListConstruct %739, %int32_2917, %int2_2918, %int32_2919, %int8_2920, %int128_2921 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2886 = torch.aten.view %2884, %2885 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2886, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2922 = torch.constant.int 2097152 | |
%2887 = torch.prim.ListConstruct %739, %int2097152_2922 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2888 = torch.aten.view %2886, %2887 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2888, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%int32_2923 = torch.constant.int 32 | |
%int2_2924 = torch.constant.int 2 | |
%int32_2925 = torch.constant.int 32 | |
%int8_2926 = torch.constant.int 8 | |
%int128_2927 = torch.constant.int 128 | |
%2889 = torch.prim.ListConstruct %739, %int32_2923, %int2_2924, %int32_2925, %int8_2926, %int128_2927 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2890 = torch.aten.view %2888, %2889 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2890, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2928 = torch.constant.int 32 | |
%int8_2929 = torch.constant.int 8 | |
%int128_2930 = torch.constant.int 128 | |
%2891 = torch.prim.ListConstruct %2880, %int32_2928, %int8_2929, %int128_2930 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2892 = torch.aten.view %2890, %2891 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2892, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2931 = torch.constant.int 1 | |
%int32_2932 = torch.constant.int 32 | |
%int8_2933 = torch.constant.int 8 | |
%int128_2934 = torch.constant.int 128 | |
%2893 = torch.prim.ListConstruct %int1_2931, %748, %int32_2932, %int8_2933, %int128_2934 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2894 = torch.aten.view %2868, %2893 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2894, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_2935 = torch.constant.int 32 | |
%int8_2936 = torch.constant.int 8 | |
%int128_2937 = torch.constant.int 128 | |
%2895 = torch.prim.ListConstruct %748, %int32_2935, %int8_2936, %int128_2937 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2896 = torch.aten.view %2894, %2895 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2896, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2938 = torch.constant.int 1 | |
%int1_2939 = torch.constant.int 1 | |
%2897 = torch.aten.add.Scalar %2870, %int1_2938, %int1_2939 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %2897, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%2898 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%2899 = torch.aten.view %2897, %2898 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2899, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
%2900 = torch.prim.ListConstruct %2899 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2940 = torch.constant.bool false | |
%2901 = torch.aten.index_put %2892, %2900, %2896, %false_2940 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2901, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2941 = torch.constant.int 32 | |
%int2_2942 = torch.constant.int 2 | |
%int32_2943 = torch.constant.int 32 | |
%int8_2944 = torch.constant.int 8 | |
%int128_2945 = torch.constant.int 128 | |
%2902 = torch.prim.ListConstruct %739, %int32_2941, %int2_2942, %int32_2943, %int8_2944, %int128_2945 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2903 = torch.aten.view %2901, %2902 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2903, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2946 = torch.constant.int 2097152 | |
%2904 = torch.prim.ListConstruct %739, %int2097152_2946 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2905 = torch.aten.view %2903, %2904 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2905, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
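// Grouped-query attention: expand each of the 8 KV heads 4x ([1, seq, 8, 1, 128] -> [1, seq, 32, 128]), then dequantize K and V back to bf16 by multiplying with the cache scale.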
%int-2_2947 = torch.constant.int -2 | |
%2906 = torch.aten.unsqueeze %2865, %int-2_2947 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2906, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_2948 = torch.constant.int 1 | |
%int8_2949 = torch.constant.int 8 | |
%int4_2950 = torch.constant.int 4 | |
%int128_2951 = torch.constant.int 128 | |
%2907 = torch.prim.ListConstruct %int1_2948, %2832, %int8_2949, %int4_2950, %int128_2951 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2952 = torch.constant.bool false | |
%2908 = torch.aten.expand %2906, %2907, %false_2952 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2908, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_2953 = torch.constant.int 0 | |
%2909 = torch.aten.clone %2908, %int0_2953 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2909, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_2954 = torch.constant.int 1 | |
%int32_2955 = torch.constant.int 32 | |
%int128_2956 = torch.constant.int 128 | |
%2910 = torch.prim.ListConstruct %int1_2954, %2832, %int32_2955, %int128_2956 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2911 = torch.aten._unsafe_view %2909, %2910 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2911, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int-2_2957 = torch.constant.int -2 | |
%2912 = torch.aten.unsqueeze %2868, %int-2_2957 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2912, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ> | |
%int1_2958 = torch.constant.int 1 | |
%2913 = torch.aten.size.int %2719, %int1_2958 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_2959 = torch.constant.int 1 | |
%int8_2960 = torch.constant.int 8 | |
%int4_2961 = torch.constant.int 4 | |
%int128_2962 = torch.constant.int 128 | |
%2914 = torch.prim.ListConstruct %int1_2959, %2913, %int8_2960, %int4_2961, %int128_2962 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2963 = torch.constant.bool false | |
%2915 = torch.aten.expand %2912, %2914, %false_2963 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2915, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int0_2964 = torch.constant.int 0 | |
%2916 = torch.aten.clone %2915, %int0_2964 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2916, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ> | |
%int1_2965 = torch.constant.int 1 | |
%int32_2966 = torch.constant.int 32 | |
%int128_2967 = torch.constant.int 128 | |
%2917 = torch.prim.ListConstruct %int1_2965, %2913, %int32_2966, %int128_2967 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2918 = torch.aten._unsafe_view %2916, %2917 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2918, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ> | |
%int6_2968 = torch.constant.int 6 | |
%2919 = torch.prims.convert_element_type %2911, %int6_2968 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2919, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2920 = torch.aten.mul.Tensor %2919, %127 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2920, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_2969 = torch.constant.int 15 | |
%2921 = torch.prims.convert_element_type %2920, %int15_2969 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2921, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int6_2970 = torch.constant.int 6 | |
%2922 = torch.prims.convert_element_type %2918, %int6_2970 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2922, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%2923 = torch.aten.mul.Tensor %2922, %127 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32> | |
torch.bind_symbolic_shape %2923, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32> | |
%int15_2971 = torch.constant.int 15 | |
%2924 = torch.prims.convert_element_type %2923, %int15_2971 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2924, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
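// Transpose Q/K/V to [1, 32, seq, 128] and run causal scaled-dot-product flash attention (dropout 0.0).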
%int1_2972 = torch.constant.int 1 | |
%int2_2973 = torch.constant.int 2 | |
%2925 = torch.aten.transpose.int %2794, %int1_2972, %int2_2973 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2925, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2974 = torch.constant.int 1 | |
%int2_2975 = torch.constant.int 2 | |
%2926 = torch.aten.transpose.int %2921, %int1_2974, %int2_2975 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2926, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_2976 = torch.constant.int 1 | |
%int2_2977 = torch.constant.int 2 | |
%2927 = torch.aten.transpose.int %2924, %int1_2976, %int2_2977 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %2927, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%float0.000000e00_2978 = torch.constant.float 0.000000e+00 | |
%true_2979 = torch.constant.bool true | |
%none_2980 = torch.constant.none | |
%none_2981 = torch.constant.none | |
%2928:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2925, %2926, %2927, %float0.000000e00_2978, %true_2979, %none_2980, %none_2981) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>) | |
torch.bind_symbolic_shape %2928#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
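// Transpose the attention output back to [1, seq, 32, 128], flatten to [1, seq, 4096], and quantize it to f8E4M3FNUZ for the output projection.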
%int1_2982 = torch.constant.int 1 | |
%int2_2983 = torch.constant.int 2 | |
%2929 = torch.aten.transpose.int %2928#0, %int1_2982, %int2_2983 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %2929, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_2984 = torch.constant.int 1 | |
%int4096_2985 = torch.constant.int 4096 | |
%2930 = torch.prim.ListConstruct %int1_2984, %2764, %int4096_2985 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2931 = torch.aten.view %2929, %2930 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2931, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2932 = torch.aten.div.Tensor %2931, %128 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2932, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_2986 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2987 = torch.constant.float 2.400000e+02 | |
%2933 = torch.aten.clamp %2932, %float-2.400000e02_2986, %float2.400000e02_2987 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2933, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_2988 = torch.constant.int 26 | |
%2934 = torch.prims.convert_element_type %2933, %int26_2988 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2934, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
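// Output projection: f8 matmul with the transposed 4096x4096 projection weight, convert back to bf16, and add the residual from the block input.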
%int-2_2989 = torch.constant.int -2 | |
%int-1_2990 = torch.constant.int -1 | |
%2935 = torch.aten.transpose.int %129, %int-2_2989, %int-1_2990 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_2991 = torch.constant.int 4096 | |
%2936 = torch.prim.ListConstruct %2764, %int4096_2991 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2937 = torch.aten.view %2934, %2936 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2937, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2938 = torch.aten.mm %2937, %2935 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2938, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_2992 = torch.constant.int 1 | |
%int4096_2993 = torch.constant.int 4096 | |
%2939 = torch.prim.ListConstruct %int1_2992, %2764, %int4096_2993 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2940 = torch.aten.view %2938, %2939 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2940, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_2994 = torch.constant.int 15 | |
%2941 = torch.prims.convert_element_type %2940, %int15_2994 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2941, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
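// Residual add: the projected attention output is added back onto the layer input (%2681).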
%int1_2995 = torch.constant.int 1 | |
%2942 = torch.aten.add.Tensor %2681, %2941, %int1_2995 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2942, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
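// FFN norm (RMSNorm): x * rsqrt(mean(x^2, dim=-1) + 1e-5), computed in f32, cast back to bf16, then scaled by the 4096-wide norm weight.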
%int6_2996 = torch.constant.int 6 | |
%2943 = torch.prims.convert_element_type %2942, %int6_2996 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2943, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_2997 = torch.constant.int 2 | |
%2944 = torch.aten.pow.Tensor_Scalar %2943, %int2_2997 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2944, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_2998 = torch.constant.int -1 | |
%2945 = torch.prim.ListConstruct %int-1_2998 : (!torch.int) -> !torch.list<int> | |
%true_2999 = torch.constant.bool true | |
%none_3000 = torch.constant.none | |
%2946 = torch.aten.mean.dim %2944, %2945, %true_2999, %none_3000 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2946, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_3001 = torch.constant.float 1.000000e-05 | |
%int1_3002 = torch.constant.int 1 | |
%2947 = torch.aten.add.Scalar %2946, %float1.000000e-05_3001, %int1_3002 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2947, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2948 = torch.aten.rsqrt %2947 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2948, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2949 = torch.aten.mul.Tensor %2943, %2948 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2949, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_3003 = torch.constant.int 15 | |
%2950 = torch.prims.convert_element_type %2949, %int15_3003 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2950, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2951 = torch.aten.mul.Tensor %130, %2950 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2951, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
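// FFN gate projection: quantize the normed activations to f8 (scale, clamp, cast), matmul 4096 -> 14336, cast to bf16, apply SiLU.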
%2952 = torch.aten.div.Tensor %2951, %131 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2952, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_3004 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3005 = torch.constant.float 2.400000e+02 | |
%2953 = torch.aten.clamp %2952, %float-2.400000e02_3004, %float2.400000e02_3005 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2953, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_3006 = torch.constant.int 26 | |
%2954 = torch.prims.convert_element_type %2953, %int26_3006 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2954, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_3007 = torch.constant.int -2 | |
%int-1_3008 = torch.constant.int -1 | |
%2955 = torch.aten.transpose.int %132, %int-2_3007, %int-1_3008 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_3009 = torch.constant.int 4096 | |
%2956 = torch.prim.ListConstruct %564, %int4096_3009 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2957 = torch.aten.view %2954, %2956 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2957, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2958 = torch.aten.mm %2957, %2955 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2958, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_3010 = torch.constant.int 1 | |
%int14336_3011 = torch.constant.int 14336 | |
%2959 = torch.prim.ListConstruct %int1_3010, %564, %int14336_3011 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2960 = torch.aten.view %2958, %2959 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2960, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_3012 = torch.constant.int 15 | |
%2961 = torch.prims.convert_element_type %2960, %int15_3012 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2961, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2962 = torch.aten.silu %2961 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2962, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
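// FFN up projection: same quantize + f8 matmul pattern, 4096 -> 14336, no activation.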
%2963 = torch.aten.div.Tensor %2951, %133 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2963, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_3013 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3014 = torch.constant.float 2.400000e+02 | |
%2964 = torch.aten.clamp %2963, %float-2.400000e02_3013, %float2.400000e02_3014 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2964, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_3015 = torch.constant.int 26 | |
%2965 = torch.prims.convert_element_type %2964, %int26_3015 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2965, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_3016 = torch.constant.int -2 | |
%int-1_3017 = torch.constant.int -1 | |
%2966 = torch.aten.transpose.int %134, %int-2_3016, %int-1_3017 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%int4096_3018 = torch.constant.int 4096 | |
%2967 = torch.prim.ListConstruct %564, %int4096_3018 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2968 = torch.aten.view %2965, %2967 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2968, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%2969 = torch.aten.mm %2968, %2966 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2969, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%int1_3019 = torch.constant.int 1 | |
%int14336_3020 = torch.constant.int 14336 | |
%2970 = torch.prim.ListConstruct %int1_3019, %564, %int14336_3020 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2971 = torch.aten.view %2969, %2970 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2971, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int15_3021 = torch.constant.int 15 | |
%2972 = torch.prims.convert_element_type %2971, %int15_3021 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2972, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
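// SwiGLU gating: silu(gate) * up elementwise, then quantize the product and project back down 14336 -> 4096 through the FFN down weight.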
%2973 = torch.aten.mul.Tensor %2962, %2972 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2973, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%2974 = torch.aten.div.Tensor %2973, %135 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2974, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%float-2.400000e02_3022 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3023 = torch.constant.float 2.400000e+02 | |
%2975 = torch.aten.clamp %2974, %float-2.400000e02_3022, %float2.400000e02_3023 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16> | |
torch.bind_symbolic_shape %2975, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16> | |
%int26_3024 = torch.constant.int 26 | |
%2976 = torch.prims.convert_element_type %2975, %int26_3024 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2976, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ> | |
%int-2_3025 = torch.constant.int -2 | |
%int-1_3026 = torch.constant.int -1 | |
%2977 = torch.aten.transpose.int %136, %int-2_3025, %int-1_3026 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%int1_3027 = torch.constant.int 1 | |
%2978 = torch.aten.size.int %2960, %int1_3027 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int14336_3028 = torch.constant.int 14336 | |
%2979 = torch.prim.ListConstruct %2978, %int14336_3028 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2980 = torch.aten.view %2976, %2979 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2980, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ> | |
%2981 = torch.aten.mm %2980, %2977 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2981, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_3029 = torch.constant.int 1 | |
%int4096_3030 = torch.constant.int 4096 | |
%2982 = torch.prim.ListConstruct %int1_3029, %2978, %int4096_3030 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2983 = torch.aten.view %2981, %2982 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2983, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_3031 = torch.constant.int 15 | |
%2984 = torch.prims.convert_element_type %2983, %int15_3031 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2984, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
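// Second residual add of the layer: FFN output added onto the post-attention hidden state.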
%int1_3032 = torch.constant.int 1 | |
%2985 = torch.aten.add.Tensor %2942, %2984, %int1_3032 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2985, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
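// Attention norm (RMSNorm) for what appears to be the following decoder layer; same f32 mean-of-squares / rsqrt / bf16 pattern as above, scaled by its own weight.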
%int6_3033 = torch.constant.int 6 | |
%2986 = torch.prims.convert_element_type %2985, %int6_3033 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2986, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int2_3034 = torch.constant.int 2 | |
%2987 = torch.aten.pow.Tensor_Scalar %2986, %int2_3034 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2987, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int-1_3035 = torch.constant.int -1 | |
%2988 = torch.prim.ListConstruct %int-1_3035 : (!torch.int) -> !torch.list<int> | |
%true_3036 = torch.constant.bool true | |
%none_3037 = torch.constant.none | |
%2989 = torch.aten.mean.dim %2987, %2988, %true_3036, %none_3037 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2989, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%float1.000000e-05_3038 = torch.constant.float 1.000000e-05 | |
%int1_3039 = torch.constant.int 1 | |
%2990 = torch.aten.add.Scalar %2989, %float1.000000e-05_3038, %int1_3039 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2990, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2991 = torch.aten.rsqrt %2990 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32> | |
torch.bind_symbolic_shape %2991, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32> | |
%2992 = torch.aten.mul.Tensor %2986, %2991 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32> | |
torch.bind_symbolic_shape %2992, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32> | |
%int15_3040 = torch.constant.int 15 | |
%2993 = torch.prims.convert_element_type %2992, %int15_3040 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2993, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%2994 = torch.aten.mul.Tensor %137, %2993 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2994, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
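// Q projection: quantize to f8 and matmul with the transposed 4096x4096 query weight, result reshaped to [1, seq, 4096] in bf16.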
%2995 = torch.aten.div.Tensor %2994, %138 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2995, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_3041 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3042 = torch.constant.float 2.400000e+02 | |
%2996 = torch.aten.clamp %2995, %float-2.400000e02_3041, %float2.400000e02_3042 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %2996, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_3043 = torch.constant.int 26 | |
%2997 = torch.prims.convert_element_type %2996, %int26_3043 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2997, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_3044 = torch.constant.int -2 | |
%int-1_3045 = torch.constant.int -1 | |
%2998 = torch.aten.transpose.int %139, %int-2_3044, %int-1_3045 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%int4096_3046 = torch.constant.int 4096 | |
%2999 = torch.prim.ListConstruct %564, %int4096_3046 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3000 = torch.aten.view %2997, %2999 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3000, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%3001 = torch.aten.mm %3000, %2998 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3001, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%int1_3047 = torch.constant.int 1 | |
%int4096_3048 = torch.constant.int 4096 | |
%3002 = torch.prim.ListConstruct %int1_3047, %564, %int4096_3048 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3003 = torch.aten.view %3001, %3002 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3003, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int15_3049 = torch.constant.int 15 | |
%3004 = torch.prims.convert_element_type %3003, %int15_3049 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %3004, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
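// K projection: 4096 -> 1024 (8 KV heads x 128), same f8 quantize + matmul pattern.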
%3005 = torch.aten.div.Tensor %2994, %140 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %3005, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_3050 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3051 = torch.constant.float 2.400000e+02 | |
%3006 = torch.aten.clamp %3005, %float-2.400000e02_3050, %float2.400000e02_3051 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %3006, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_3052 = torch.constant.int 26 | |
%3007 = torch.prims.convert_element_type %3006, %int26_3052 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3007, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_3053 = torch.constant.int -2 | |
%int-1_3054 = torch.constant.int -1 | |
%3008 = torch.aten.transpose.int %141, %int-2_3053, %int-1_3054 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_3055 = torch.constant.int 4096 | |
%3009 = torch.prim.ListConstruct %564, %int4096_3055 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3010 = torch.aten.view %3007, %3009 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3010, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%3011 = torch.aten.mm %3010, %3008 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3011, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_3056 = torch.constant.int 1 | |
%int1024_3057 = torch.constant.int 1024 | |
%3012 = torch.prim.ListConstruct %int1_3056, %564, %int1024_3057 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3013 = torch.aten.view %3011, %3012 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3013, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_3058 = torch.constant.int 15 | |
%3014 = torch.prims.convert_element_type %3013, %int15_3058 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %3014, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
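// V projection: 4096 -> 1024, same f8 quantize + matmul pattern.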
%3015 = torch.aten.div.Tensor %2994, %142 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %3015, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%float-2.400000e02_3059 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3060 = torch.constant.float 2.400000e+02 | |
%3016 = torch.aten.clamp %3015, %float-2.400000e02_3059, %float2.400000e02_3060 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16> | |
torch.bind_symbolic_shape %3016, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16> | |
%int26_3061 = torch.constant.int 26 | |
%3017 = torch.prims.convert_element_type %3016, %int26_3061 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3017, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ> | |
%int-2_3062 = torch.constant.int -2 | |
%int-1_3063 = torch.constant.int -1 | |
%3018 = torch.aten.transpose.int %143, %int-2_3062, %int-1_3063 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ> | |
%int4096_3064 = torch.constant.int 4096 | |
%3019 = torch.prim.ListConstruct %564, %int4096_3064 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3020 = torch.aten.view %3017, %3019 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3020, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ> | |
%3021 = torch.aten.mm %3020, %3018 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3021, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ> | |
%int1_3065 = torch.constant.int 1 | |
%int1024_3066 = torch.constant.int 1024 | |
%3022 = torch.prim.ListConstruct %int1_3065, %564, %int1024_3066 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3023 = torch.aten.view %3021, %3022 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3023, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ> | |
%int15_3067 = torch.constant.int 15 | |
%3024 = torch.prims.convert_element_type %3023, %int15_3067 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16> | |
torch.bind_symbolic_shape %3024, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16> | |
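// Split heads: Q -> [1, seq, 32, 128], K and V -> [1, seq, 8, 128] (grouped-query layout, 4 query heads per KV head).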
%int1_3068 = torch.constant.int 1 | |
%int32_3069 = torch.constant.int 32 | |
%int128_3070 = torch.constant.int 128 | |
%3025 = torch.prim.ListConstruct %int1_3068, %564, %int32_3069, %int128_3070 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3026 = torch.aten.view %3004, %3025 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %3026, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
%int1_3071 = torch.constant.int 1 | |
%int8_3072 = torch.constant.int 8 | |
%int128_3073 = torch.constant.int 128 | |
%3027 = torch.prim.ListConstruct %int1_3071, %564, %int8_3072, %int128_3073 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3028 = torch.aten.view %3014, %3027 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3028, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int1_3074 = torch.constant.int 1 | |
%int8_3075 = torch.constant.int 8 | |
%int128_3076 = torch.constant.int 128 | |
%3029 = torch.prim.ListConstruct %int1_3074, %564, %int8_3075, %int128_3076 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3030 = torch.aten.view %3024, %3029 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3030, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
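// Rotary embedding table, recomputed inline over 131072 positions. The op sequence is consistent with Llama-3-style frequency scaling:
//   inv_freq = 1 / 500000^(arange(0, 128, 2) / 128); wavelen = 2*pi / inv_freq
//   wavelen > 8192: use inv_freq / 8; 2048 <= wavelen <= 8192: blend inv_freq/8 and inv_freq with smooth = (8192 / wavelen - 1) / 3
//   angles = positions[:, None] * cat(freqs, freqs); cos and sin of the angles are cast to bf16.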
%int131072_3077 = torch.constant.int 131072 | |
%none_3078 = torch.constant.none | |
%none_3079 = torch.constant.none | |
%cpu_3080 = torch.constant.device "cpu" | |
%false_3081 = torch.constant.bool false | |
%3031 = torch.aten.arange %int131072_3077, %none_3078, %none_3079, %cpu_3080, %false_3081 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_3082 = torch.constant.int 0 | |
%int128_3083 = torch.constant.int 128 | |
%int2_3084 = torch.constant.int 2 | |
%int4_3085 = torch.constant.int 4 | |
%none_3086 = torch.constant.none | |
%cpu_3087 = torch.constant.device "cpu" | |
%false_3088 = torch.constant.bool false | |
%3032 = torch.aten.arange.start_step %int0_3082, %int128_3083, %int2_3084, %int4_3085, %none_3086, %cpu_3087, %false_3088 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_3089 = torch.constant.int 6 | |
%3033 = torch.prims.convert_element_type %3032, %int6_3089 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_3090 = torch.constant.int 128 | |
%3034 = torch.aten.div.Scalar %3033, %int128_3090 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_3091 = torch.constant.float 5.000000e+05 | |
%3035 = torch.aten.pow.Scalar %float5.000000e05_3091, %3034 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3036 = torch.aten.reciprocal %3035 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_3092 = torch.constant.float 1.000000e+00 | |
%3037 = torch.aten.mul.Scalar %3036, %float1.000000e00_3092 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%3038 = torch.aten.reciprocal %3037 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_3093 = torch.constant.float 6.2831853071795862 | |
%3039 = torch.aten.mul.Scalar %3038, %float6.283190e00_3093 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_3094 = torch.constant.float 8.192000e+03 | |
%3040 = torch.aten.gt.Scalar %3039, %float8.192000e03_3094 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_3095 = torch.constant.int 8 | |
%3041 = torch.aten.div.Scalar %3037, %int8_3095 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3042 = torch.aten.where.self %3040, %3041, %3037 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3043 = torch.aten.reciprocal %3039 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_3096 = torch.constant.int 8192 | |
%3044 = torch.aten.mul.Scalar %3043, %int8192_3096 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_3097 = torch.constant.int 1 | |
%int1_3098 = torch.constant.int 1 | |
%3045 = torch.aten.sub.Scalar %3044, %int1_3097, %int1_3098 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_3099 = torch.constant.int 3 | |
%3046 = torch.aten.div.Scalar %3045, %int3_3099 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_3100 = torch.constant.int 1 | |
%int1_3101 = torch.constant.int 1 | |
%3047 = torch.aten.rsub.Scalar %3046, %int1_3100, %int1_3101 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%3048 = torch.aten.mul.Tensor %3047, %3042 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_3102 = torch.constant.int 8 | |
%3049 = torch.aten.div.Scalar %3048, %int8_3102 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3050 = torch.aten.mul.Tensor %3046, %3042 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_3103 = torch.constant.int 1 | |
%3051 = torch.aten.add.Tensor %3049, %3050, %int1_3103 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_3104 = torch.constant.float 2.048000e+03 | |
%3052 = torch.aten.lt.Scalar %3039, %float2.048000e03_3104 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3053 = torch.aten.bitwise_not %3052 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_3105 = torch.constant.float 8.192000e+03 | |
%3054 = torch.aten.gt.Scalar %3039, %float8.192000e03_3105 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3055 = torch.aten.bitwise_not %3054 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3056 = torch.aten.mul.Tensor %3053, %3055 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3057 = torch.aten.where.self %3056, %3051, %3042 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3058 = torch.prim.ListConstruct %3057, %3057 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_3106 = torch.constant.int -1 | |
%3059 = torch.aten.cat %3058, %int-1_3106 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_3107 = torch.constant.int 6 | |
%3060 = torch.prims.convert_element_type %3031, %int6_3107 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_3108 = torch.constant.int 131072 | |
%int1_3109 = torch.constant.int 1 | |
%3061 = torch.prim.ListConstruct %int131072_3108, %int1_3109 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3062 = torch.aten.view %3060, %3061 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%3063 = torch.aten.mul.Tensor %3062, %3059 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%3064 = torch.aten.cos %3063 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_3110 = torch.constant.int 15 | |
%3065 = torch.prims.convert_element_type %3064, %int15_3110 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%3066 = torch.aten.sin %3063 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_3111 = torch.constant.int 15 | |
%3067 = torch.prims.convert_element_type %3066, %int15_3111 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
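// Slice the cos/sin tables to the current sequence length (dim 1 of the Q projection output) and reshape them to [1, seq, 1, 128].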
%int1_3112 = torch.constant.int 1 | |
%3068 = torch.aten.size.int %3003, %int1_3112 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_3113 = torch.constant.int 0 | |
%3069 = torch.aten.add.int %int0_3113, %3068 : !torch.int, !torch.int -> !torch.int | |
%int0_3114 = torch.constant.int 0 | |
%int0_3115 = torch.constant.int 0 | |
%int1_3116 = torch.constant.int 1 | |
%3070 = torch.aten.slice.Tensor %3065, %int0_3114, %int0_3115, %3069, %int1_3116 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3070, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_3117 = torch.constant.int 1 | |
%int0_3118 = torch.constant.int 0 | |
%int9223372036854775807_3119 = torch.constant.int 9223372036854775807 | |
%int1_3120 = torch.constant.int 1 | |
%3071 = torch.aten.slice.Tensor %3070, %int1_3117, %int0_3118, %int9223372036854775807_3119, %int1_3120 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3071, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_3121 = torch.constant.int 0 | |
%3072 = torch.aten.add.int %int0_3121, %3068 : !torch.int, !torch.int -> !torch.int | |
%int0_3122 = torch.constant.int 0 | |
%int0_3123 = torch.constant.int 0 | |
%int1_3124 = torch.constant.int 1 | |
%3073 = torch.aten.slice.Tensor %3067, %int0_3122, %int0_3123, %3072, %int1_3124 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3073, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_3125 = torch.constant.int 1 | |
%int0_3126 = torch.constant.int 0 | |
%int9223372036854775807_3127 = torch.constant.int 9223372036854775807 | |
%int1_3128 = torch.constant.int 1 | |
%3074 = torch.aten.slice.Tensor %3073, %int1_3125, %int0_3126, %int9223372036854775807_3127, %int1_3128 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3074, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_3129 = torch.constant.int 0 | |
%3075 = torch.aten.unsqueeze %3071, %int0_3129 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3075, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_3130 = torch.constant.int 1 | |
%int0_3131 = torch.constant.int 0 | |
%int9223372036854775807_3132 = torch.constant.int 9223372036854775807 | |
%int1_3133 = torch.constant.int 1 | |
%3076 = torch.aten.slice.Tensor %3075, %int1_3130, %int0_3131, %int9223372036854775807_3132, %int1_3133 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3076, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_3134 = torch.constant.int 2 | |
%3077 = torch.aten.unsqueeze %3076, %int2_3134 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3077, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_3135 = torch.constant.int 3 | |
%int0_3136 = torch.constant.int 0 | |
%int9223372036854775807_3137 = torch.constant.int 9223372036854775807 | |
%int1_3138 = torch.constant.int 1 | |
%3078 = torch.aten.slice.Tensor %3077, %int3_3135, %int0_3136, %int9223372036854775807_3137, %int1_3138 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3078, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_3139 = torch.constant.int 0 | |
%3079 = torch.aten.unsqueeze %3074, %int0_3139 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3079, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_3140 = torch.constant.int 1 | |
%int0_3141 = torch.constant.int 0 | |
%int9223372036854775807_3142 = torch.constant.int 9223372036854775807 | |
%int1_3143 = torch.constant.int 1 | |
%3080 = torch.aten.slice.Tensor %3079, %int1_3140, %int0_3141, %int9223372036854775807_3142, %int1_3143 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3080, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_3144 = torch.constant.int 2 | |
%3081 = torch.aten.unsqueeze %3080, %int2_3144 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3081, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_3145 = torch.constant.int 3 | |
%int0_3146 = torch.constant.int 0 | |
%int9223372036854775807_3147 = torch.constant.int 9223372036854775807 | |
%int1_3148 = torch.constant.int 1 | |
%3082 = torch.aten.slice.Tensor %3081, %int3_3145, %int0_3146, %int9223372036854775807_3147, %int1_3148 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3082, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
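// Apply rotary embedding to Q: q * cos + rotate_half(q) * sin, where rotate_half is cat(-q[..., 64:], q[..., :64]); then transpose back to [1, seq, 32, 128].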
%int1_3149 = torch.constant.int 1 | |
%int2_3150 = torch.constant.int 2 | |
%3083 = torch.aten.transpose.int %3078, %int1_3149, %int2_3150 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3083, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_3151 = torch.constant.int 1 | |
%int1_3152 = torch.constant.int 1 | |
%int1_3153 = torch.constant.int 1 | |
%int1_3154 = torch.constant.int 1 | |
%3084 = torch.prim.ListConstruct %int1_3151, %int1_3152, %int1_3153, %int1_3154 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3085 = torch.aten.repeat %3083, %3084 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3085, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_3155 = torch.constant.int 1 | |
%int2_3156 = torch.constant.int 2 | |
%3086 = torch.aten.transpose.int %3082, %int1_3155, %int2_3156 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3086, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_3157 = torch.constant.int 1 | |
%int2_3158 = torch.constant.int 2 | |
%3087 = torch.aten.transpose.int %3026, %int1_3157, %int2_3158 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %3087, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_3159 = torch.constant.int 1 | |
%int1_3160 = torch.constant.int 1 | |
%int1_3161 = torch.constant.int 1 | |
%int1_3162 = torch.constant.int 1 | |
%3088 = torch.prim.ListConstruct %int1_3159, %int1_3160, %int1_3161, %int1_3162 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3089 = torch.aten.repeat %3086, %3088 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3089, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%3090 = torch.aten.mul.Tensor %3087, %3085 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %3090, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int3_3163 = torch.constant.int 3 | |
%int0_3164 = torch.constant.int 0 | |
%int64_3165 = torch.constant.int 64 | |
%int1_3166 = torch.constant.int 1 | |
%3091 = torch.aten.slice.Tensor %3087, %int3_3163, %int0_3164, %int64_3165, %int1_3166 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %3091, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%int3_3167 = torch.constant.int 3 | |
%int64_3168 = torch.constant.int 64 | |
%int9223372036854775807_3169 = torch.constant.int 9223372036854775807 | |
%int1_3170 = torch.constant.int 1 | |
%3092 = torch.aten.slice.Tensor %3087, %int3_3167, %int64_3168, %int9223372036854775807_3169, %int1_3170 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %3092, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%3093 = torch.aten.neg %3092 : !torch.vtensor<[1,32,?,64],bf16> -> !torch.vtensor<[1,32,?,64],bf16> | |
torch.bind_symbolic_shape %3093, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 64)> : !torch.vtensor<[1,32,?,64],bf16> | |
%3094 = torch.prim.ListConstruct %3093, %3091 : (!torch.vtensor<[1,32,?,64],bf16>, !torch.vtensor<[1,32,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_3171 = torch.constant.int -1 | |
%3095 = torch.aten.cat %3094, %int-1_3171 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %3095, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%3096 = torch.aten.mul.Tensor %3095, %3089 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %3096, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_3172 = torch.constant.int 1 | |
%3097 = torch.aten.add.Tensor %3090, %3096, %int1_3172 : !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.int -> !torch.vtensor<[1,32,?,128],bf16> | |
torch.bind_symbolic_shape %3097, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16> | |
%int1_3173 = torch.constant.int 1 | |
%int2_3174 = torch.constant.int 2 | |
%3098 = torch.aten.transpose.int %3097, %int1_3173, %int2_3174 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16> | |
torch.bind_symbolic_shape %3098, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16> | |
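// The same rotary table (arange, frequency scaling, cos/sin) is recomputed verbatim for the K path.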
%int131072_3175 = torch.constant.int 131072 | |
%none_3176 = torch.constant.none | |
%none_3177 = torch.constant.none | |
%cpu_3178 = torch.constant.device "cpu" | |
%false_3179 = torch.constant.bool false | |
%3099 = torch.aten.arange %int131072_3175, %none_3176, %none_3177, %cpu_3178, %false_3179 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_3180 = torch.constant.int 0 | |
%int128_3181 = torch.constant.int 128 | |
%int2_3182 = torch.constant.int 2 | |
%int4_3183 = torch.constant.int 4 | |
%none_3184 = torch.constant.none | |
%cpu_3185 = torch.constant.device "cpu" | |
%false_3186 = torch.constant.bool false | |
%3100 = torch.aten.arange.start_step %int0_3180, %int128_3181, %int2_3182, %int4_3183, %none_3184, %cpu_3185, %false_3186 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_3187 = torch.constant.int 6 | |
%3101 = torch.prims.convert_element_type %3100, %int6_3187 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_3188 = torch.constant.int 128 | |
%3102 = torch.aten.div.Scalar %3101, %int128_3188 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_3189 = torch.constant.float 5.000000e+05 | |
%3103 = torch.aten.pow.Scalar %float5.000000e05_3189, %3102 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3104 = torch.aten.reciprocal %3103 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_3190 = torch.constant.float 1.000000e+00 | |
%3105 = torch.aten.mul.Scalar %3104, %float1.000000e00_3190 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%3106 = torch.aten.reciprocal %3105 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_3191 = torch.constant.float 6.2831853071795862 | |
%3107 = torch.aten.mul.Scalar %3106, %float6.283190e00_3191 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_3192 = torch.constant.float 8.192000e+03 | |
%3108 = torch.aten.gt.Scalar %3107, %float8.192000e03_3192 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_3193 = torch.constant.int 8 | |
%3109 = torch.aten.div.Scalar %3105, %int8_3193 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3110 = torch.aten.where.self %3108, %3109, %3105 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3111 = torch.aten.reciprocal %3107 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_3194 = torch.constant.int 8192 | |
%3112 = torch.aten.mul.Scalar %3111, %int8192_3194 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_3195 = torch.constant.int 1 | |
%int1_3196 = torch.constant.int 1 | |
%3113 = torch.aten.sub.Scalar %3112, %int1_3195, %int1_3196 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_3197 = torch.constant.int 3 | |
%3114 = torch.aten.div.Scalar %3113, %int3_3197 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_3198 = torch.constant.int 1 | |
%int1_3199 = torch.constant.int 1 | |
%3115 = torch.aten.rsub.Scalar %3114, %int1_3198, %int1_3199 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%3116 = torch.aten.mul.Tensor %3115, %3110 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_3200 = torch.constant.int 8 | |
%3117 = torch.aten.div.Scalar %3116, %int8_3200 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3118 = torch.aten.mul.Tensor %3114, %3110 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_3201 = torch.constant.int 1 | |
%3119 = torch.aten.add.Tensor %3117, %3118, %int1_3201 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_3202 = torch.constant.float 2.048000e+03 | |
%3120 = torch.aten.lt.Scalar %3107, %float2.048000e03_3202 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3121 = torch.aten.bitwise_not %3120 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_3203 = torch.constant.float 8.192000e+03 | |
%3122 = torch.aten.gt.Scalar %3107, %float8.192000e03_3203 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3123 = torch.aten.bitwise_not %3122 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3124 = torch.aten.mul.Tensor %3121, %3123 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3125 = torch.aten.where.self %3124, %3119, %3110 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3126 = torch.prim.ListConstruct %3125, %3125 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_3204 = torch.constant.int -1 | |
%3127 = torch.aten.cat %3126, %int-1_3204 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_3205 = torch.constant.int 6 | |
%3128 = torch.prims.convert_element_type %3099, %int6_3205 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072],f32> | |
%int131072_3206 = torch.constant.int 131072 | |
%int1_3207 = torch.constant.int 1 | |
%3129 = torch.prim.ListConstruct %int131072_3206, %int1_3207 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3130 = torch.aten.view %3128, %3129 : !torch.vtensor<[131072],f32>, !torch.list<int> -> !torch.vtensor<[131072,1],f32> | |
%3131 = torch.aten.mul.Tensor %3130, %3127 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32> | |
%3132 = torch.aten.cos %3131 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_3208 = torch.constant.int 15 | |
%3133 = torch.prims.convert_element_type %3132, %int15_3208 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%3134 = torch.aten.sin %3131 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_3209 = torch.constant.int 15 | |
%3135 = torch.prims.convert_element_type %3134, %int15_3209 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
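// Slice cos/sin to the K sequence length (dim 1 of the K projection output) and reshape to [1, seq, 1, 128].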
%int1_3210 = torch.constant.int 1 | |
%3136 = torch.aten.size.int %3013, %int1_3210 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int0_3211 = torch.constant.int 0 | |
%3137 = torch.aten.add.int %int0_3211, %3136 : !torch.int, !torch.int -> !torch.int | |
%int0_3212 = torch.constant.int 0 | |
%int0_3213 = torch.constant.int 0 | |
%int1_3214 = torch.constant.int 1 | |
%3138 = torch.aten.slice.Tensor %3133, %int0_3212, %int0_3213, %3137, %int1_3214 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3138, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_3215 = torch.constant.int 1 | |
%int0_3216 = torch.constant.int 0 | |
%int9223372036854775807_3217 = torch.constant.int 9223372036854775807 | |
%int1_3218 = torch.constant.int 1 | |
%3139 = torch.aten.slice.Tensor %3138, %int1_3215, %int0_3216, %int9223372036854775807_3217, %int1_3218 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3139, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_3219 = torch.constant.int 0 | |
%3140 = torch.aten.add.int %int0_3219, %3136 : !torch.int, !torch.int -> !torch.int | |
%int0_3220 = torch.constant.int 0 | |
%int0_3221 = torch.constant.int 0 | |
%int1_3222 = torch.constant.int 1 | |
%3141 = torch.aten.slice.Tensor %3135, %int0_3220, %int0_3221, %3140, %int1_3222 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3141, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_3223 = torch.constant.int 1 | |
%int0_3224 = torch.constant.int 0 | |
%int9223372036854775807_3225 = torch.constant.int 9223372036854775807 | |
%int1_3226 = torch.constant.int 1 | |
%3142 = torch.aten.slice.Tensor %3141, %int1_3223, %int0_3224, %int9223372036854775807_3225, %int1_3226 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3142, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_3227 = torch.constant.int 0 | |
%3143 = torch.aten.unsqueeze %3139, %int0_3227 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3143, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_3228 = torch.constant.int 1 | |
%int0_3229 = torch.constant.int 0 | |
%int9223372036854775807_3230 = torch.constant.int 9223372036854775807 | |
%int1_3231 = torch.constant.int 1 | |
%3144 = torch.aten.slice.Tensor %3143, %int1_3228, %int0_3229, %int9223372036854775807_3230, %int1_3231 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3144, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_3232 = torch.constant.int 2 | |
%3145 = torch.aten.unsqueeze %3144, %int2_3232 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3145, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_3233 = torch.constant.int 3 | |
%int0_3234 = torch.constant.int 0 | |
%int9223372036854775807_3235 = torch.constant.int 9223372036854775807 | |
%int1_3236 = torch.constant.int 1 | |
%3146 = torch.aten.slice.Tensor %3145, %int3_3233, %int0_3234, %int9223372036854775807_3235, %int1_3236 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3146, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int0_3237 = torch.constant.int 0 | |
%3147 = torch.aten.unsqueeze %3142, %int0_3237 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3147, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_3238 = torch.constant.int 1 | |
%int0_3239 = torch.constant.int 0 | |
%int9223372036854775807_3240 = torch.constant.int 9223372036854775807 | |
%int1_3241 = torch.constant.int 1 | |
%3148 = torch.aten.slice.Tensor %3147, %int1_3238, %int0_3239, %int9223372036854775807_3240, %int1_3241 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3148, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_3242 = torch.constant.int 2 | |
%3149 = torch.aten.unsqueeze %3148, %int2_3242 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3149, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_3243 = torch.constant.int 3 | |
%int0_3244 = torch.constant.int 0 | |
%int9223372036854775807_3245 = torch.constant.int 9223372036854775807 | |
%int1_3246 = torch.constant.int 1 | |
%3150 = torch.aten.slice.Tensor %3149, %int3_3243, %int0_3244, %int9223372036854775807_3245, %int1_3246 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3150, [%548], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
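// Apply rotary embedding to K (8 KV heads): k * cos + rotate_half(k) * sin, then transpose back to [1, seq, 8, 128].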
%int1_3247 = torch.constant.int 1 | |
%int2_3248 = torch.constant.int 2 | |
%3151 = torch.aten.transpose.int %3146, %int1_3247, %int2_3248 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3151, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_3249 = torch.constant.int 1 | |
%int1_3250 = torch.constant.int 1 | |
%int1_3251 = torch.constant.int 1 | |
%int1_3252 = torch.constant.int 1 | |
%3152 = torch.prim.ListConstruct %int1_3249, %int1_3250, %int1_3251, %int1_3252 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3153 = torch.aten.repeat %3151, %3152 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3153, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_3253 = torch.constant.int 1 | |
%int2_3254 = torch.constant.int 2 | |
%3154 = torch.aten.transpose.int %3150, %int1_3253, %int2_3254 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3154, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%int1_3255 = torch.constant.int 1 | |
%int2_3256 = torch.constant.int 2 | |
%3155 = torch.aten.transpose.int %3028, %int1_3255, %int2_3256 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %3155, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_3257 = torch.constant.int 1 | |
%int1_3258 = torch.constant.int 1 | |
%int1_3259 = torch.constant.int 1 | |
%int1_3260 = torch.constant.int 1 | |
%3156 = torch.prim.ListConstruct %int1_3257, %int1_3258, %int1_3259, %int1_3260 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3157 = torch.aten.repeat %3154, %3156 : !torch.vtensor<[1,1,?,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,1,?,128],bf16> | |
torch.bind_symbolic_shape %3157, [%548], affine_map<()[s0] -> (1, 1, s0 * 32, 128)> : !torch.vtensor<[1,1,?,128],bf16> | |
%3158 = torch.aten.mul.Tensor %3155, %3153 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %3158, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int3_3261 = torch.constant.int 3 | |
%int0_3262 = torch.constant.int 0 | |
%int64_3263 = torch.constant.int 64 | |
%int1_3264 = torch.constant.int 1 | |
%3159 = torch.aten.slice.Tensor %3155, %int3_3261, %int0_3262, %int64_3263, %int1_3264 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %3159, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%int3_3265 = torch.constant.int 3 | |
%int64_3266 = torch.constant.int 64 | |
%int9223372036854775807_3267 = torch.constant.int 9223372036854775807 | |
%int1_3268 = torch.constant.int 1 | |
%3160 = torch.aten.slice.Tensor %3155, %int3_3265, %int64_3266, %int9223372036854775807_3267, %int1_3268 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %3160, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%3161 = torch.aten.neg %3160 : !torch.vtensor<[1,8,?,64],bf16> -> !torch.vtensor<[1,8,?,64],bf16> | |
torch.bind_symbolic_shape %3161, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 64)> : !torch.vtensor<[1,8,?,64],bf16> | |
%3162 = torch.prim.ListConstruct %3161, %3159 : (!torch.vtensor<[1,8,?,64],bf16>, !torch.vtensor<[1,8,?,64],bf16>) -> !torch.list<vtensor> | |
%int-1_3269 = torch.constant.int -1 | |
%3163 = torch.aten.cat %3162, %int-1_3269 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %3163, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%3164 = torch.aten.mul.Tensor %3163, %3157 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,1,?,128],bf16> -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %3164, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
%int1_3270 = torch.constant.int 1 | |
%3165 = torch.aten.add.Tensor %3158, %3164, %int1_3270 : !torch.vtensor<[1,8,?,128],bf16>, !torch.vtensor<[1,8,?,128],bf16>, !torch.int -> !torch.vtensor<[1,8,?,128],bf16> | |
torch.bind_symbolic_shape %3165, [%548], affine_map<()[s0] -> (1, 8, s0 * 32, 128)> : !torch.vtensor<[1,8,?,128],bf16> | |
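// Transpose the rotated keys back to [1, seq, 8, 128].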
%int1_3271 = torch.constant.int 1 | |
%int2_3272 = torch.constant.int 2 | |
%3166 = torch.aten.transpose.int %3165, %int1_3271, %int2_3272 : !torch.vtensor<[1,8,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3166, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
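// Quantize the keys for the fp8 KV cache: divide by the scalar scale %144, clamp to [-240, 240]
// (the finite range of f8E4M3FNUZ), then cast to f8E4M3FNUZ (torch dtype code 26).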
%3167 = torch.aten.div.Tensor %3166, %144 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3167, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_3273 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3274 = torch.constant.float 2.400000e+02 | |
%3168 = torch.aten.clamp %3167, %float-2.400000e02_3273, %float2.400000e02_3274 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3168, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_3275 = torch.constant.int 26 | |
%3169 = torch.prims.convert_element_type %3168, %int26_3275 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3169, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
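// Quantize the value states %3030 with the same scale, clamp, and cast.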
%3170 = torch.aten.div.Tensor %3030, %144 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3170, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%float-2.400000e02_3276 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_3277 = torch.constant.float 2.400000e+02 | |
%3171 = torch.aten.clamp %3170, %float-2.400000e02_3276, %float2.400000e02_3277 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16> | |
torch.bind_symbolic_shape %3171, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16> | |
%int26_3278 = torch.constant.int 26 | |
%3172 = torch.prims.convert_element_type %3171, %int26_3278 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3172, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ> | |
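// Compute destination slot indices in the paged KV cache: page id (%arg2) * 64 sub-blocks per page,
// plus an offset of 16 -- presumably 2 * block_index, selecting this block's key sub-block.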
%int64_3279 = torch.constant.int 64 | |
%3173 = torch.aten.mul.Scalar %arg2, %int64_3279 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %3173, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
%int16 = torch.constant.int 16 | |
%int1_3280 = torch.constant.int 1 | |
%3174 = torch.aten.add.Scalar %3173, %int16, %int1_3280 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %3174, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64> | |
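// Reshape the quantized keys to page granularity ([1, pages, 32, 8, 128] -> [pages, 32, 8, 128])
// and flatten the slot indices to a 1-D tensor.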
%int1_3281 = torch.constant.int 1 | |
%int32_3282 = torch.constant.int 32 | |
%int8_3283 = torch.constant.int 8 | |
%int128_3284 = torch.constant.int 128 | |
%3175 = torch.prim.ListConstruct %int1_3281, %748, %int32_3282, %int8_3283, %int128_3284 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3176 = torch.aten.view %3169, %3175 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3176, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_3285 = torch.constant.int 32 | |
%int8_3286 = torch.constant.int 8 | |
%int128_3287 = torch.constant.int 128 | |
%3177 = torch.prim.ListConstruct %748, %int32_3285, %int8_3286, %int128_3287 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3178 = torch.aten.view %3176, %3177 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3178, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3179 = torch.prim.ListConstruct %748 : (!torch.int) -> !torch.list<int> | |
%3180 = torch.aten.view %3174, %3179 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %3180, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64> | |
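// View the flat cache buffer %2905 ([pages, 2097152] f8E4M3FNUZ) as
// [pages, 32 layers, 2 (K/V), 32 positions, 8 heads, 128], then collapse the leading dims into a
// single slot dimension of size pages * 64.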
%int32_3288 = torch.constant.int 32 | |
%int2_3289 = torch.constant.int 2 | |
%int32_3290 = torch.constant.int 32 | |
%int8_3291 = torch.constant.int 8 | |
%int128_3292 = torch.constant.int 128 | |
%3181 = torch.prim.ListConstruct %739, %int32_3288, %int2_3289, %int32_3290, %int8_3291, %int128_3292 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3182 = torch.aten.view %2905, %3181 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3182, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_3293 = torch.constant.int 32 | |
%3183 = torch.aten.mul.int %739, %int32_3293 : !torch.int, !torch.int -> !torch.int | |
%int2_3294 = torch.constant.int 2 | |
%3184 = torch.aten.mul.int %3183, %int2_3294 : !torch.int, !torch.int -> !torch.int | |
%int32_3295 = torch.constant.int 32 | |
%int8_3296 = torch.constant.int 8 | |
%int128_3297 = torch.constant.int 128 | |
%3185 = torch.prim.ListConstruct %3184, %int32_3295, %int8_3296, %int128_3297 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3186 = torch.aten.view %3182, %3185 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3186, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
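// Scatter the new key blocks into their cache slots (index_put with accumulate = false),
// then restore the flat [pages, 2097152] view of the cache.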
%3187 = torch.prim.ListConstruct %3180 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_3298 = torch.constant.bool false | |
%3188 = torch.aten.index_put %3186, %3187, %3178, %false_3298 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3188, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_3299 = torch.constant.int 32 | |
%int2_3300 = torch.constant.int 2 | |
%int32_3301 = torch.constant.int 32 | |
%int8_3302 = torch.constant.int 8 | |
%int128_3303 = torch.constant.int 128 | |
%3189 = torch.prim.ListConstruct %739, %int32_3299, %int2_3300, %int32_3301, %int8_3302, %int128_3303 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3190 = torch.aten.view %3188, %3189 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3190, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_3304 = torch.constant.int 2097152 | |
%3191 = torch.prim.ListConstruct %739, %int2097152_3304 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3192 = torch.aten.view %3190, %3191 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3192, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
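// Re-expand the updated cache to slot granularity for the value write.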
%int32_3305 = torch.constant.int 32 | |
%int2_3306 = torch.constant.int 2 | |
%int32_3307 = torch.constant.int 32 | |
%int8_3308 = torch.constant.int 8 | |
%int128_3309 = torch.constant.int 128 | |
%3193 = torch.prim.ListConstruct %739, %int32_3305, %int2_3306, %int32_3307, %int8_3308, %int128_3309 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3194 = torch.aten.view %3192, %3193 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3194, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_3310 = torch.constant.int 32 | |
%int8_3311 = torch.constant.int 8 | |
%int128_3312 = torch.constant.int 128 | |
%3195 = torch.prim.ListConstruct %3184, %int32_3310, %int8_3311, %int128_3312 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3196 = torch.aten.view %3194, %3195 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3196, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
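// Reshape the quantized values to page granularity, mirroring the key path above.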
%int1_3313 = torch.constant.int 1 | |
%int32_3314 = torch.constant.int 32 | |
%int8_3315 = torch.constant.int 8 | |
%int128_3316 = torch.constant.int 128 | |
%3197 = torch.prim.ListConstruct %int1_3313, %748, %int32_3314, %int8_3315, %int128_3316 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3198 = torch.aten.view %3172, %3197 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3198, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ> | |
%int32_3317 = torch.constant.int 32 | |
%int8_3318 = torch.constant.int 8 | |
%int128_3319 = torch.constant.int 128 | |
%3199 = torch.prim.ListConstruct %748, %int32_3317, %int8_3318, %int128_3319 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3200 = torch.aten.view %3198, %3199 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3200, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
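// Value slot indices are the key slot indices plus 1: presumably the V sub-block, which
// immediately follows the K sub-block within each page.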
%int1_3320 = torch.constant.int 1 | |
%int1_3321 = torch.constant.int 1 | |
%3201 = torch.aten.add.Scalar %3174, %int1_3320, %int1_3321 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64> | |
torch.bind_symbolic_shape %3201, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>