// NOTE: This file has been truncated; the full IR is available in the original gist.
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
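// #map drops d2 from the 4-D iteration space (a broadcast pattern), while #map1 is the identity map over
// all four dimensions. Their uses are in the truncated body below; as a guess, they would typically appear
// together as the indexing_maps of linalg.generic ops that broadcast a rank-3 operand against a rank-4 one.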
module @module {
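// The globals below follow a per-block naming convention for the quantized checkpoint: each transformer
// block blk.N carries bf16 norm weights (attn_norm, ffn_norm), f8E4M3FNUZ quantized weight tensors
// suffixed ":qs", and per-tensor f32 rescale factors suffixed ":rscale" for the quantized inputs and the
// KV-cache quantizer. All of them are resolved at load time from the "model" scope of an external
// parameter archive via #stream.parameter.named rather than being inlined as constants.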
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.0.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.1.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.2.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.3.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.4.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.5.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.6.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.7.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.8.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.9.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.10.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.11.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.12.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.13.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.14.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.15.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.16.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.17.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.18.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.18.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.18.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.19.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.19.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.19.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.20.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.20.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.20.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.21.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.21.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.21.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.22.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.22.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.22.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.23.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.23.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.23.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.24.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.24.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.24.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.25.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.25.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.25.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.26.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.26.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.26.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.27.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.27.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.27.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.28.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.28.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.28.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.29.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.29.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.29.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.30.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.30.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.30.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.31.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.31.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.31.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16>
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16>
func.func @prefill_bs1(%arg0: !torch.vtensor<[1,?],si64>, %arg1: !torch.vtensor<[1],si64>, %arg2: !torch.vtensor<[1,?],si64>, %arg3: !torch.tensor<[?,2097152],f16>) -> !torch.vtensor<[1,?,128256],f32> attributes {torch.assume_strict_symbolic_shapes} {
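    // Preamble (likely auto-generated): load every per-block parameter global declared above — attention/FFN quantized weights (":qs"), input rescale factors (":rscale"), KV-cache quantizer scales, and norm weights — and convert each builtin tensor to a torch vtensor via torch_c.from_builtin_tensor for use in the prefill computation below.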
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xbf16>
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16>
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xbf16>
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.0.attn_q.q_input3Arscale = util.global.load @"__auto.blk.0.attn_q.q_input:rscale" : tensor<f32>
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_q.weight3Aqs = util.global.load @"__auto.blk.0.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.0.attn_k.q_input3Arscale = util.global.load @"__auto.blk.0.attn_k.q_input:rscale" : tensor<f32>
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_k.weight3Aqs = util.global.load @"__auto.blk.0.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%5 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.0.attn_v.q_input3Arscale = util.global.load @"__auto.blk.0.attn_v.q_input:rscale" : tensor<f32>
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_v.weight3Aqs = util.global.load @"__auto.blk.0.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%7 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.0.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.0.kv_cache.quantizer:rscale" : tensor<f32>
%8 = torch_c.from_builtin_tensor %__auto.blk.0.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_output.q_input3Arscale = util.global.load @"__auto.blk.0.attn_output.q_input:rscale" : tensor<f32>
%9 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_output.weight3Aqs = util.global.load @"__auto.blk.0.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%10 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xbf16>
%11 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.0.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_gate.q_input:rscale" : tensor<f32>
%12 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.0.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%13 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_up.q_input:rscale" : tensor<f32>
%14 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_up.weight3Aqs = util.global.load @"__auto.blk.0.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%15 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_down.q_input:rscale" : tensor<f32>
%16 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_down.weight3Aqs = util.global.load @"__auto.blk.0.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%17 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xbf16>
%18 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.1.attn_q.q_input3Arscale = util.global.load @"__auto.blk.1.attn_q.q_input:rscale" : tensor<f32>
%19 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_q.weight3Aqs = util.global.load @"__auto.blk.1.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%20 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.1.attn_k.q_input3Arscale = util.global.load @"__auto.blk.1.attn_k.q_input:rscale" : tensor<f32>
%21 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_k.weight3Aqs = util.global.load @"__auto.blk.1.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%22 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.1.attn_v.q_input3Arscale = util.global.load @"__auto.blk.1.attn_v.q_input:rscale" : tensor<f32>
%23 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_v.weight3Aqs = util.global.load @"__auto.blk.1.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%24 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.1.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.1.kv_cache.quantizer:rscale" : tensor<f32>
%25 = torch_c.from_builtin_tensor %__auto.blk.1.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_output.q_input3Arscale = util.global.load @"__auto.blk.1.attn_output.q_input:rscale" : tensor<f32>
%26 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_output.weight3Aqs = util.global.load @"__auto.blk.1.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%27 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xbf16>
%28 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.1.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_gate.q_input:rscale" : tensor<f32>
%29 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.1.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%30 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_up.q_input:rscale" : tensor<f32>
%31 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_up.weight3Aqs = util.global.load @"__auto.blk.1.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%32 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_down.q_input:rscale" : tensor<f32>
%33 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_down.weight3Aqs = util.global.load @"__auto.blk.1.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%34 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xbf16>
%35 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.2.attn_q.q_input3Arscale = util.global.load @"__auto.blk.2.attn_q.q_input:rscale" : tensor<f32>
%36 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_q.weight3Aqs = util.global.load @"__auto.blk.2.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%37 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.2.attn_k.q_input3Arscale = util.global.load @"__auto.blk.2.attn_k.q_input:rscale" : tensor<f32>
%38 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_k.weight3Aqs = util.global.load @"__auto.blk.2.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%39 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.2.attn_v.q_input3Arscale = util.global.load @"__auto.blk.2.attn_v.q_input:rscale" : tensor<f32>
%40 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_v.weight3Aqs = util.global.load @"__auto.blk.2.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%41 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.2.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.2.kv_cache.quantizer:rscale" : tensor<f32>
%42 = torch_c.from_builtin_tensor %__auto.blk.2.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_output.q_input3Arscale = util.global.load @"__auto.blk.2.attn_output.q_input:rscale" : tensor<f32>
%43 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_output.weight3Aqs = util.global.load @"__auto.blk.2.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%44 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xbf16>
%45 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.2.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_gate.q_input:rscale" : tensor<f32>
%46 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.2.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%47 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_up.q_input:rscale" : tensor<f32>
%48 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_up.weight3Aqs = util.global.load @"__auto.blk.2.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%49 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_down.q_input:rscale" : tensor<f32>
%50 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_down.weight3Aqs = util.global.load @"__auto.blk.2.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%51 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xbf16>
%52 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.3.attn_q.q_input3Arscale = util.global.load @"__auto.blk.3.attn_q.q_input:rscale" : tensor<f32>
%53 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_q.weight3Aqs = util.global.load @"__auto.blk.3.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%54 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.3.attn_k.q_input3Arscale = util.global.load @"__auto.blk.3.attn_k.q_input:rscale" : tensor<f32>
%55 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_k.weight3Aqs = util.global.load @"__auto.blk.3.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%56 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.3.attn_v.q_input3Arscale = util.global.load @"__auto.blk.3.attn_v.q_input:rscale" : tensor<f32>
%57 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_v.weight3Aqs = util.global.load @"__auto.blk.3.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%58 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.3.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.3.kv_cache.quantizer:rscale" : tensor<f32>
%59 = torch_c.from_builtin_tensor %__auto.blk.3.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_output.q_input3Arscale = util.global.load @"__auto.blk.3.attn_output.q_input:rscale" : tensor<f32>
%60 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_output.weight3Aqs = util.global.load @"__auto.blk.3.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%61 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xbf16>
%62 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.3.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_gate.q_input:rscale" : tensor<f32>
%63 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.3.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%64 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_up.q_input:rscale" : tensor<f32>
%65 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_up.weight3Aqs = util.global.load @"__auto.blk.3.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%66 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_down.q_input:rscale" : tensor<f32>
%67 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_down.weight3Aqs = util.global.load @"__auto.blk.3.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%68 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xbf16>
%69 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.4.attn_q.q_input3Arscale = util.global.load @"__auto.blk.4.attn_q.q_input:rscale" : tensor<f32>
%70 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_q.weight3Aqs = util.global.load @"__auto.blk.4.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%71 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.4.attn_k.q_input3Arscale = util.global.load @"__auto.blk.4.attn_k.q_input:rscale" : tensor<f32>
%72 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_k.weight3Aqs = util.global.load @"__auto.blk.4.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%73 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.4.attn_v.q_input3Arscale = util.global.load @"__auto.blk.4.attn_v.q_input:rscale" : tensor<f32>
%74 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_v.weight3Aqs = util.global.load @"__auto.blk.4.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%75 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.4.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.4.kv_cache.quantizer:rscale" : tensor<f32>
%76 = torch_c.from_builtin_tensor %__auto.blk.4.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_output.q_input3Arscale = util.global.load @"__auto.blk.4.attn_output.q_input:rscale" : tensor<f32>
%77 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_output.weight3Aqs = util.global.load @"__auto.blk.4.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%78 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xbf16>
%79 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.4.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_gate.q_input:rscale" : tensor<f32>
%80 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.4.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%81 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.4.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_up.q_input:rscale" : tensor<f32>
%82 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_up.weight3Aqs = util.global.load @"__auto.blk.4.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%83 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.4.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_down.q_input:rscale" : tensor<f32>
%84 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_down.weight3Aqs = util.global.load @"__auto.blk.4.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%85 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xbf16>
%86 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.5.attn_q.q_input3Arscale = util.global.load @"__auto.blk.5.attn_q.q_input:rscale" : tensor<f32>
%87 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_q.weight3Aqs = util.global.load @"__auto.blk.5.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%88 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.5.attn_k.q_input3Arscale = util.global.load @"__auto.blk.5.attn_k.q_input:rscale" : tensor<f32>
%89 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_k.weight3Aqs = util.global.load @"__auto.blk.5.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%90 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.5.attn_v.q_input3Arscale = util.global.load @"__auto.blk.5.attn_v.q_input:rscale" : tensor<f32>
%91 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_v.weight3Aqs = util.global.load @"__auto.blk.5.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%92 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.5.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.5.kv_cache.quantizer:rscale" : tensor<f32>
%93 = torch_c.from_builtin_tensor %__auto.blk.5.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_output.q_input3Arscale = util.global.load @"__auto.blk.5.attn_output.q_input:rscale" : tensor<f32>
%94 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_output.weight3Aqs = util.global.load @"__auto.blk.5.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%95 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xbf16>
%96 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.5.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_gate.q_input:rscale" : tensor<f32>
%97 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.5.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%98 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.5.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_up.q_input:rscale" : tensor<f32>
%99 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_up.weight3Aqs = util.global.load @"__auto.blk.5.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%100 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.5.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_down.q_input:rscale" : tensor<f32>
%101 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_down.weight3Aqs = util.global.load @"__auto.blk.5.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%102 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xbf16>
%103 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.6.attn_q.q_input3Arscale = util.global.load @"__auto.blk.6.attn_q.q_input:rscale" : tensor<f32>
%104 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_q.weight3Aqs = util.global.load @"__auto.blk.6.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%105 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.6.attn_k.q_input3Arscale = util.global.load @"__auto.blk.6.attn_k.q_input:rscale" : tensor<f32>
%106 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_k.weight3Aqs = util.global.load @"__auto.blk.6.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%107 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.6.attn_v.q_input3Arscale = util.global.load @"__auto.blk.6.attn_v.q_input:rscale" : tensor<f32>
%108 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_v.weight3Aqs = util.global.load @"__auto.blk.6.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%109 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.6.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.6.kv_cache.quantizer:rscale" : tensor<f32>
%110 = torch_c.from_builtin_tensor %__auto.blk.6.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_output.q_input3Arscale = util.global.load @"__auto.blk.6.attn_output.q_input:rscale" : tensor<f32>
%111 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_output.weight3Aqs = util.global.load @"__auto.blk.6.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%112 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xbf16>
%113 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.6.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_gate.q_input:rscale" : tensor<f32>
%114 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.6.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%115 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.6.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_up.q_input:rscale" : tensor<f32>
%116 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_up.weight3Aqs = util.global.load @"__auto.blk.6.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%117 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.6.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_down.q_input:rscale" : tensor<f32>
%118 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_down.weight3Aqs = util.global.load @"__auto.blk.6.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%119 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xbf16>
%120 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.7.attn_q.q_input3Arscale = util.global.load @"__auto.blk.7.attn_q.q_input:rscale" : tensor<f32>
%121 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_q.weight3Aqs = util.global.load @"__auto.blk.7.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%122 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.7.attn_k.q_input3Arscale = util.global.load @"__auto.blk.7.attn_k.q_input:rscale" : tensor<f32>
%123 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_k.weight3Aqs = util.global.load @"__auto.blk.7.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%124 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.7.attn_v.q_input3Arscale = util.global.load @"__auto.blk.7.attn_v.q_input:rscale" : tensor<f32>
%125 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_v.weight3Aqs = util.global.load @"__auto.blk.7.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%126 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.7.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.7.kv_cache.quantizer:rscale" : tensor<f32>
%127 = torch_c.from_builtin_tensor %__auto.blk.7.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_output.q_input3Arscale = util.global.load @"__auto.blk.7.attn_output.q_input:rscale" : tensor<f32>
%128 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_output.weight3Aqs = util.global.load @"__auto.blk.7.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%129 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xbf16>
%130 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.7.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_gate.q_input:rscale" : tensor<f32>
%131 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.7.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%132 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.7.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_up.q_input:rscale" : tensor<f32>
%133 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_up.weight3Aqs = util.global.load @"__auto.blk.7.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%134 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.7.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_down.q_input:rscale" : tensor<f32>
%135 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_down.weight3Aqs = util.global.load @"__auto.blk.7.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%136 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xbf16>
%137 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.8.attn_q.q_input3Arscale = util.global.load @"__auto.blk.8.attn_q.q_input:rscale" : tensor<f32>
%138 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_q.weight3Aqs = util.global.load @"__auto.blk.8.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%139 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.8.attn_k.q_input3Arscale = util.global.load @"__auto.blk.8.attn_k.q_input:rscale" : tensor<f32>
%140 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_k.weight3Aqs = util.global.load @"__auto.blk.8.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%141 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.8.attn_v.q_input3Arscale = util.global.load @"__auto.blk.8.attn_v.q_input:rscale" : tensor<f32>
%142 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_v.weight3Aqs = util.global.load @"__auto.blk.8.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%143 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.8.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.8.kv_cache.quantizer:rscale" : tensor<f32>
%144 = torch_c.from_builtin_tensor %__auto.blk.8.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_output.q_input3Arscale = util.global.load @"__auto.blk.8.attn_output.q_input:rscale" : tensor<f32>
%145 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_output.weight3Aqs = util.global.load @"__auto.blk.8.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%146 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xbf16>
%147 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.8.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_gate.q_input:rscale" : tensor<f32>
%148 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.8.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%149 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.8.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_up.q_input:rscale" : tensor<f32>
%150 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_up.weight3Aqs = util.global.load @"__auto.blk.8.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%151 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.8.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_down.q_input:rscale" : tensor<f32>
%152 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_down.weight3Aqs = util.global.load @"__auto.blk.8.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%153 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
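    // blk.9 parameter loads (same layout as blk.8).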
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xbf16>
%154 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.9.attn_q.q_input3Arscale = util.global.load @"__auto.blk.9.attn_q.q_input:rscale" : tensor<f32>
%155 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_q.weight3Aqs = util.global.load @"__auto.blk.9.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%156 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.9.attn_k.q_input3Arscale = util.global.load @"__auto.blk.9.attn_k.q_input:rscale" : tensor<f32>
%157 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_k.weight3Aqs = util.global.load @"__auto.blk.9.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%158 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.9.attn_v.q_input3Arscale = util.global.load @"__auto.blk.9.attn_v.q_input:rscale" : tensor<f32>
%159 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_v.weight3Aqs = util.global.load @"__auto.blk.9.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%160 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.9.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.9.kv_cache.quantizer:rscale" : tensor<f32>
%161 = torch_c.from_builtin_tensor %__auto.blk.9.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_output.q_input3Arscale = util.global.load @"__auto.blk.9.attn_output.q_input:rscale" : tensor<f32>
%162 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_output.weight3Aqs = util.global.load @"__auto.blk.9.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%163 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xbf16>
%164 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.9.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_gate.q_input:rscale" : tensor<f32>
%165 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.9.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%166 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.9.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_up.q_input:rscale" : tensor<f32>
%167 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_up.weight3Aqs = util.global.load @"__auto.blk.9.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%168 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.9.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_down.q_input:rscale" : tensor<f32>
%169 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_down.weight3Aqs = util.global.load @"__auto.blk.9.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%170 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
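    // blk.10 parameter loads.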
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xbf16>
%171 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.10.attn_q.q_input3Arscale = util.global.load @"__auto.blk.10.attn_q.q_input:rscale" : tensor<f32>
%172 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_q.weight3Aqs = util.global.load @"__auto.blk.10.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%173 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.10.attn_k.q_input3Arscale = util.global.load @"__auto.blk.10.attn_k.q_input:rscale" : tensor<f32>
%174 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_k.weight3Aqs = util.global.load @"__auto.blk.10.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%175 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.10.attn_v.q_input3Arscale = util.global.load @"__auto.blk.10.attn_v.q_input:rscale" : tensor<f32>
%176 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_v.weight3Aqs = util.global.load @"__auto.blk.10.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%177 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.10.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.10.kv_cache.quantizer:rscale" : tensor<f32>
%178 = torch_c.from_builtin_tensor %__auto.blk.10.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_output.q_input3Arscale = util.global.load @"__auto.blk.10.attn_output.q_input:rscale" : tensor<f32>
%179 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_output.weight3Aqs = util.global.load @"__auto.blk.10.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%180 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xbf16>
%181 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.10.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_gate.q_input:rscale" : tensor<f32>
%182 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.10.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%183 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.10.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_up.q_input:rscale" : tensor<f32>
%184 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_up.weight3Aqs = util.global.load @"__auto.blk.10.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%185 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.10.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_down.q_input:rscale" : tensor<f32>
%186 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_down.weight3Aqs = util.global.load @"__auto.blk.10.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%187 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
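    // blk.11 parameter loads.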
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xbf16>
%188 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.11.attn_q.q_input3Arscale = util.global.load @"__auto.blk.11.attn_q.q_input:rscale" : tensor<f32>
%189 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_q.weight3Aqs = util.global.load @"__auto.blk.11.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%190 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.11.attn_k.q_input3Arscale = util.global.load @"__auto.blk.11.attn_k.q_input:rscale" : tensor<f32>
%191 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_k.weight3Aqs = util.global.load @"__auto.blk.11.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%192 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.11.attn_v.q_input3Arscale = util.global.load @"__auto.blk.11.attn_v.q_input:rscale" : tensor<f32>
%193 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_v.weight3Aqs = util.global.load @"__auto.blk.11.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%194 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.11.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.11.kv_cache.quantizer:rscale" : tensor<f32>
%195 = torch_c.from_builtin_tensor %__auto.blk.11.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_output.q_input3Arscale = util.global.load @"__auto.blk.11.attn_output.q_input:rscale" : tensor<f32>
%196 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_output.weight3Aqs = util.global.load @"__auto.blk.11.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%197 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xbf16>
%198 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.11.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_gate.q_input:rscale" : tensor<f32>
%199 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.11.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%200 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.11.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_up.q_input:rscale" : tensor<f32>
%201 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_up.weight3Aqs = util.global.load @"__auto.blk.11.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%202 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.11.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_down.q_input:rscale" : tensor<f32>
%203 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_down.weight3Aqs = util.global.load @"__auto.blk.11.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%204 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
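    // blk.12 parameter loads.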
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xbf16>
%205 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.12.attn_q.q_input3Arscale = util.global.load @"__auto.blk.12.attn_q.q_input:rscale" : tensor<f32>
%206 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_q.weight3Aqs = util.global.load @"__auto.blk.12.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%207 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.12.attn_k.q_input3Arscale = util.global.load @"__auto.blk.12.attn_k.q_input:rscale" : tensor<f32>
%208 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_k.weight3Aqs = util.global.load @"__auto.blk.12.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%209 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.12.attn_v.q_input3Arscale = util.global.load @"__auto.blk.12.attn_v.q_input:rscale" : tensor<f32>
%210 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_v.weight3Aqs = util.global.load @"__auto.blk.12.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%211 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.12.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.12.kv_cache.quantizer:rscale" : tensor<f32>
%212 = torch_c.from_builtin_tensor %__auto.blk.12.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_output.q_input3Arscale = util.global.load @"__auto.blk.12.attn_output.q_input:rscale" : tensor<f32>
%213 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_output.weight3Aqs = util.global.load @"__auto.blk.12.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%214 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xbf16>
%215 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.12.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_gate.q_input:rscale" : tensor<f32>
%216 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.12.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%217 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_up.q_input:rscale" : tensor<f32>
%218 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_up.weight3Aqs = util.global.load @"__auto.blk.12.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%219 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_down.q_input:rscale" : tensor<f32>
%220 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_down.weight3Aqs = util.global.load @"__auto.blk.12.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%221 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
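    // blk.13 parameter loads.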
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xbf16>
%222 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.13.attn_q.q_input3Arscale = util.global.load @"__auto.blk.13.attn_q.q_input:rscale" : tensor<f32>
%223 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_q.weight3Aqs = util.global.load @"__auto.blk.13.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%224 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.13.attn_k.q_input3Arscale = util.global.load @"__auto.blk.13.attn_k.q_input:rscale" : tensor<f32>
%225 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_k.weight3Aqs = util.global.load @"__auto.blk.13.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%226 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.13.attn_v.q_input3Arscale = util.global.load @"__auto.blk.13.attn_v.q_input:rscale" : tensor<f32>
%227 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_v.weight3Aqs = util.global.load @"__auto.blk.13.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%228 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.13.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.13.kv_cache.quantizer:rscale" : tensor<f32>
%229 = torch_c.from_builtin_tensor %__auto.blk.13.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_output.q_input3Arscale = util.global.load @"__auto.blk.13.attn_output.q_input:rscale" : tensor<f32>
%230 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_output.weight3Aqs = util.global.load @"__auto.blk.13.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%231 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xbf16>
%232 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.13.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_gate.q_input:rscale" : tensor<f32>
%233 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.13.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%234 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_up.q_input:rscale" : tensor<f32>
%235 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_up.weight3Aqs = util.global.load @"__auto.blk.13.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%236 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_down.q_input:rscale" : tensor<f32>
%237 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_down.weight3Aqs = util.global.load @"__auto.blk.13.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%238 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
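    // blk.14 parameter loads.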
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xbf16>
%239 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.14.attn_q.q_input3Arscale = util.global.load @"__auto.blk.14.attn_q.q_input:rscale" : tensor<f32>
%240 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_q.weight3Aqs = util.global.load @"__auto.blk.14.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%241 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.14.attn_k.q_input3Arscale = util.global.load @"__auto.blk.14.attn_k.q_input:rscale" : tensor<f32>
%242 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_k.weight3Aqs = util.global.load @"__auto.blk.14.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%243 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.14.attn_v.q_input3Arscale = util.global.load @"__auto.blk.14.attn_v.q_input:rscale" : tensor<f32>
%244 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_v.weight3Aqs = util.global.load @"__auto.blk.14.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%245 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.14.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.14.kv_cache.quantizer:rscale" : tensor<f32>
%246 = torch_c.from_builtin_tensor %__auto.blk.14.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_output.q_input3Arscale = util.global.load @"__auto.blk.14.attn_output.q_input:rscale" : tensor<f32>
%247 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_output.weight3Aqs = util.global.load @"__auto.blk.14.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%248 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xbf16>
%249 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.14.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_gate.q_input:rscale" : tensor<f32>
%250 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.14.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%251 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_up.q_input:rscale" : tensor<f32>
%252 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_up.weight3Aqs = util.global.load @"__auto.blk.14.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%253 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_down.q_input:rscale" : tensor<f32>
%254 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_down.weight3Aqs = util.global.load @"__auto.blk.14.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%255 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
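    // blk.15 parameter loads.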
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xbf16>
%256 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.15.attn_q.q_input3Arscale = util.global.load @"__auto.blk.15.attn_q.q_input:rscale" : tensor<f32>
%257 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_q.weight3Aqs = util.global.load @"__auto.blk.15.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%258 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.15.attn_k.q_input3Arscale = util.global.load @"__auto.blk.15.attn_k.q_input:rscale" : tensor<f32>
%259 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_k.weight3Aqs = util.global.load @"__auto.blk.15.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%260 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.15.attn_v.q_input3Arscale = util.global.load @"__auto.blk.15.attn_v.q_input:rscale" : tensor<f32>
%261 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_v.weight3Aqs = util.global.load @"__auto.blk.15.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%262 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.15.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.15.kv_cache.quantizer:rscale" : tensor<f32>
%263 = torch_c.from_builtin_tensor %__auto.blk.15.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_output.q_input3Arscale = util.global.load @"__auto.blk.15.attn_output.q_input:rscale" : tensor<f32>
%264 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_output.weight3Aqs = util.global.load @"__auto.blk.15.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%265 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xbf16>
%266 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.15.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_gate.q_input:rscale" : tensor<f32>
%267 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.15.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%268 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_up.q_input:rscale" : tensor<f32>
%269 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_up.weight3Aqs = util.global.load @"__auto.blk.15.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%270 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_down.q_input:rscale" : tensor<f32>
%271 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_down.weight3Aqs = util.global.load @"__auto.blk.15.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%272 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
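    // blk.16 parameter loads.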
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xbf16>
%273 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.16.attn_q.q_input3Arscale = util.global.load @"__auto.blk.16.attn_q.q_input:rscale" : tensor<f32>
%274 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_q.weight3Aqs = util.global.load @"__auto.blk.16.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%275 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.16.attn_k.q_input3Arscale = util.global.load @"__auto.blk.16.attn_k.q_input:rscale" : tensor<f32>
%276 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_k.weight3Aqs = util.global.load @"__auto.blk.16.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%277 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.16.attn_v.q_input3Arscale = util.global.load @"__auto.blk.16.attn_v.q_input:rscale" : tensor<f32>
%278 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_v.weight3Aqs = util.global.load @"__auto.blk.16.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%279 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.16.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.16.kv_cache.quantizer:rscale" : tensor<f32>
%280 = torch_c.from_builtin_tensor %__auto.blk.16.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_output.q_input3Arscale = util.global.load @"__auto.blk.16.attn_output.q_input:rscale" : tensor<f32>
%281 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_output.weight3Aqs = util.global.load @"__auto.blk.16.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%282 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xbf16>
%283 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.16.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_gate.q_input:rscale" : tensor<f32>
%284 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.16.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%285 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_up.q_input:rscale" : tensor<f32>
%286 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_up.weight3Aqs = util.global.load @"__auto.blk.16.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%287 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_down.q_input:rscale" : tensor<f32>
%288 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_down.weight3Aqs = util.global.load @"__auto.blk.16.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%289 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
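    // blk.17 parameter loads.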
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xbf16>
%290 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.17.attn_q.q_input3Arscale = util.global.load @"__auto.blk.17.attn_q.q_input:rscale" : tensor<f32>
%291 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_q.weight3Aqs = util.global.load @"__auto.blk.17.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%292 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.17.attn_k.q_input3Arscale = util.global.load @"__auto.blk.17.attn_k.q_input:rscale" : tensor<f32>
%293 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_k.weight3Aqs = util.global.load @"__auto.blk.17.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%294 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.17.attn_v.q_input3Arscale = util.global.load @"__auto.blk.17.attn_v.q_input:rscale" : tensor<f32>
%295 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_v.weight3Aqs = util.global.load @"__auto.blk.17.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%296 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.17.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.17.kv_cache.quantizer:rscale" : tensor<f32>
%297 = torch_c.from_builtin_tensor %__auto.blk.17.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_output.q_input3Arscale = util.global.load @"__auto.blk.17.attn_output.q_input:rscale" : tensor<f32>
%298 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_output.weight3Aqs = util.global.load @"__auto.blk.17.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%299 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xbf16>
%300 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.17.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_gate.q_input:rscale" : tensor<f32>
%301 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.17.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%302 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_up.q_input:rscale" : tensor<f32>
%303 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_up.weight3Aqs = util.global.load @"__auto.blk.17.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%304 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_down.q_input:rscale" : tensor<f32>
%305 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_down.weight3Aqs = util.global.load @"__auto.blk.17.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%306 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
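    // blk.18 parameter loads.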
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xbf16>
%307 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.18.attn_q.q_input3Arscale = util.global.load @"__auto.blk.18.attn_q.q_input:rscale" : tensor<f32>
%308 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_q.weight3Aqs = util.global.load @"__auto.blk.18.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%309 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.18.attn_k.q_input3Arscale = util.global.load @"__auto.blk.18.attn_k.q_input:rscale" : tensor<f32>
%310 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_k.weight3Aqs = util.global.load @"__auto.blk.18.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%311 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.18.attn_v.q_input3Arscale = util.global.load @"__auto.blk.18.attn_v.q_input:rscale" : tensor<f32>
%312 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_v.weight3Aqs = util.global.load @"__auto.blk.18.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%313 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.18.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.18.kv_cache.quantizer:rscale" : tensor<f32>
%314 = torch_c.from_builtin_tensor %__auto.blk.18.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_output.q_input3Arscale = util.global.load @"__auto.blk.18.attn_output.q_input:rscale" : tensor<f32>
%315 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_output.weight3Aqs = util.global.load @"__auto.blk.18.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%316 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xbf16>
%317 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.18.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_gate.q_input:rscale" : tensor<f32>
%318 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.18.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%319 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_up.q_input:rscale" : tensor<f32>
%320 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_up.weight3Aqs = util.global.load @"__auto.blk.18.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%321 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_down.q_input:rscale" : tensor<f32>
%322 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_down.weight3Aqs = util.global.load @"__auto.blk.18.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%323 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
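    // blk.19 parameter loads.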
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xbf16>
%324 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.19.attn_q.q_input3Arscale = util.global.load @"__auto.blk.19.attn_q.q_input:rscale" : tensor<f32>
%325 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_q.weight3Aqs = util.global.load @"__auto.blk.19.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%326 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.19.attn_k.q_input3Arscale = util.global.load @"__auto.blk.19.attn_k.q_input:rscale" : tensor<f32>
%327 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_k.weight3Aqs = util.global.load @"__auto.blk.19.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%328 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.19.attn_v.q_input3Arscale = util.global.load @"__auto.blk.19.attn_v.q_input:rscale" : tensor<f32>
%329 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_v.weight3Aqs = util.global.load @"__auto.blk.19.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%330 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.19.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.19.kv_cache.quantizer:rscale" : tensor<f32>
%331 = torch_c.from_builtin_tensor %__auto.blk.19.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_output.q_input3Arscale = util.global.load @"__auto.blk.19.attn_output.q_input:rscale" : tensor<f32>
%332 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_output.weight3Aqs = util.global.load @"__auto.blk.19.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%333 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xbf16>
%334 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.19.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_gate.q_input:rscale" : tensor<f32>
%335 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.19.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%336 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_up.q_input:rscale" : tensor<f32>
%337 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_up.weight3Aqs = util.global.load @"__auto.blk.19.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%338 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_down.q_input:rscale" : tensor<f32>
%339 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_down.weight3Aqs = util.global.load @"__auto.blk.19.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%340 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
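    // blk.20 parameter loads.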
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xbf16>
%341 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.20.attn_q.q_input3Arscale = util.global.load @"__auto.blk.20.attn_q.q_input:rscale" : tensor<f32>
%342 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_q.weight3Aqs = util.global.load @"__auto.blk.20.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%343 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.20.attn_k.q_input3Arscale = util.global.load @"__auto.blk.20.attn_k.q_input:rscale" : tensor<f32>
%344 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_k.weight3Aqs = util.global.load @"__auto.blk.20.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%345 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.20.attn_v.q_input3Arscale = util.global.load @"__auto.blk.20.attn_v.q_input:rscale" : tensor<f32>
%346 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_v.weight3Aqs = util.global.load @"__auto.blk.20.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%347 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.20.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.20.kv_cache.quantizer:rscale" : tensor<f32>
%348 = torch_c.from_builtin_tensor %__auto.blk.20.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_output.q_input3Arscale = util.global.load @"__auto.blk.20.attn_output.q_input:rscale" : tensor<f32>
%349 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_output.weight3Aqs = util.global.load @"__auto.blk.20.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%350 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xbf16>
%351 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.20.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_gate.q_input:rscale" : tensor<f32>
%352 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.20.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%353 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_up.q_input:rscale" : tensor<f32>
%354 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_up.weight3Aqs = util.global.load @"__auto.blk.20.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%355 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_down.q_input:rscale" : tensor<f32>
%356 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_down.weight3Aqs = util.global.load @"__auto.blk.20.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%357 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
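    // blk.21 parameter loads.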
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xbf16>
%358 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.21.attn_q.q_input3Arscale = util.global.load @"__auto.blk.21.attn_q.q_input:rscale" : tensor<f32>
%359 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_q.weight3Aqs = util.global.load @"__auto.blk.21.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%360 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.21.attn_k.q_input3Arscale = util.global.load @"__auto.blk.21.attn_k.q_input:rscale" : tensor<f32>
%361 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_k.weight3Aqs = util.global.load @"__auto.blk.21.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%362 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.21.attn_v.q_input3Arscale = util.global.load @"__auto.blk.21.attn_v.q_input:rscale" : tensor<f32>
%363 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_v.weight3Aqs = util.global.load @"__auto.blk.21.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%364 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.21.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.21.kv_cache.quantizer:rscale" : tensor<f32>
%365 = torch_c.from_builtin_tensor %__auto.blk.21.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_output.q_input3Arscale = util.global.load @"__auto.blk.21.attn_output.q_input:rscale" : tensor<f32>
%366 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_output.weight3Aqs = util.global.load @"__auto.blk.21.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%367 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xbf16>
%368 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.21.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_gate.q_input:rscale" : tensor<f32>
%369 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.21.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%370 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_up.q_input:rscale" : tensor<f32>
%371 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_up.weight3Aqs = util.global.load @"__auto.blk.21.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%372 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_down.q_input:rscale" : tensor<f32>
%373 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_down.weight3Aqs = util.global.load @"__auto.blk.21.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%374 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
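    // blk.22 parameter loads.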
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xbf16>
%375 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.22.attn_q.q_input3Arscale = util.global.load @"__auto.blk.22.attn_q.q_input:rscale" : tensor<f32>
%376 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_q.weight3Aqs = util.global.load @"__auto.blk.22.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%377 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.22.attn_k.q_input3Arscale = util.global.load @"__auto.blk.22.attn_k.q_input:rscale" : tensor<f32>
%378 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_k.weight3Aqs = util.global.load @"__auto.blk.22.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%379 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.22.attn_v.q_input3Arscale = util.global.load @"__auto.blk.22.attn_v.q_input:rscale" : tensor<f32>
%380 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_v.weight3Aqs = util.global.load @"__auto.blk.22.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%381 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.22.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.22.kv_cache.quantizer:rscale" : tensor<f32>
%382 = torch_c.from_builtin_tensor %__auto.blk.22.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_output.q_input3Arscale = util.global.load @"__auto.blk.22.attn_output.q_input:rscale" : tensor<f32>
%383 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_output.weight3Aqs = util.global.load @"__auto.blk.22.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%384 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xbf16>
%385 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.22.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_gate.q_input:rscale" : tensor<f32>
%386 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.22.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%387 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_up.q_input:rscale" : tensor<f32>
%388 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_up.weight3Aqs = util.global.load @"__auto.blk.22.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%389 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_down.q_input:rscale" : tensor<f32>
%390 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_down.weight3Aqs = util.global.load @"__auto.blk.22.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%391 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
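    // blk.23 parameters (same layout as blk.22).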
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xbf16>
%392 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.23.attn_q.q_input3Arscale = util.global.load @"__auto.blk.23.attn_q.q_input:rscale" : tensor<f32>
%393 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_q.weight3Aqs = util.global.load @"__auto.blk.23.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%394 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.23.attn_k.q_input3Arscale = util.global.load @"__auto.blk.23.attn_k.q_input:rscale" : tensor<f32>
%395 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_k.weight3Aqs = util.global.load @"__auto.blk.23.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%396 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.23.attn_v.q_input3Arscale = util.global.load @"__auto.blk.23.attn_v.q_input:rscale" : tensor<f32>
%397 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_v.weight3Aqs = util.global.load @"__auto.blk.23.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%398 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.23.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.23.kv_cache.quantizer:rscale" : tensor<f32>
%399 = torch_c.from_builtin_tensor %__auto.blk.23.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_output.q_input3Arscale = util.global.load @"__auto.blk.23.attn_output.q_input:rscale" : tensor<f32>
%400 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_output.weight3Aqs = util.global.load @"__auto.blk.23.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%401 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xbf16>
%402 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.23.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_gate.q_input:rscale" : tensor<f32>
%403 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.23.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%404 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_up.q_input:rscale" : tensor<f32>
%405 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_up.weight3Aqs = util.global.load @"__auto.blk.23.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%406 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_down.q_input:rscale" : tensor<f32>
%407 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_down.weight3Aqs = util.global.load @"__auto.blk.23.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%408 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
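    // blk.24 parameters.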
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xbf16>
%409 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.24.attn_q.q_input3Arscale = util.global.load @"__auto.blk.24.attn_q.q_input:rscale" : tensor<f32>
%410 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_q.weight3Aqs = util.global.load @"__auto.blk.24.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%411 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.24.attn_k.q_input3Arscale = util.global.load @"__auto.blk.24.attn_k.q_input:rscale" : tensor<f32>
%412 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_k.weight3Aqs = util.global.load @"__auto.blk.24.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%413 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.24.attn_v.q_input3Arscale = util.global.load @"__auto.blk.24.attn_v.q_input:rscale" : tensor<f32>
%414 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_v.weight3Aqs = util.global.load @"__auto.blk.24.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%415 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.24.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.24.kv_cache.quantizer:rscale" : tensor<f32>
%416 = torch_c.from_builtin_tensor %__auto.blk.24.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_output.q_input3Arscale = util.global.load @"__auto.blk.24.attn_output.q_input:rscale" : tensor<f32>
%417 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_output.weight3Aqs = util.global.load @"__auto.blk.24.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%418 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xbf16>
%419 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.24.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_gate.q_input:rscale" : tensor<f32>
%420 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.24.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%421 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_up.q_input:rscale" : tensor<f32>
%422 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_up.weight3Aqs = util.global.load @"__auto.blk.24.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%423 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_down.q_input:rscale" : tensor<f32>
%424 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_down.weight3Aqs = util.global.load @"__auto.blk.24.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%425 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
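    // blk.25 parameters.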
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xbf16>
%426 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.25.attn_q.q_input3Arscale = util.global.load @"__auto.blk.25.attn_q.q_input:rscale" : tensor<f32>
%427 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_q.weight3Aqs = util.global.load @"__auto.blk.25.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%428 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.25.attn_k.q_input3Arscale = util.global.load @"__auto.blk.25.attn_k.q_input:rscale" : tensor<f32>
%429 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_k.weight3Aqs = util.global.load @"__auto.blk.25.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%430 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.25.attn_v.q_input3Arscale = util.global.load @"__auto.blk.25.attn_v.q_input:rscale" : tensor<f32>
%431 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_v.weight3Aqs = util.global.load @"__auto.blk.25.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%432 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.25.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.25.kv_cache.quantizer:rscale" : tensor<f32>
%433 = torch_c.from_builtin_tensor %__auto.blk.25.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_output.q_input3Arscale = util.global.load @"__auto.blk.25.attn_output.q_input:rscale" : tensor<f32>
%434 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_output.weight3Aqs = util.global.load @"__auto.blk.25.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%435 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xbf16>
%436 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.25.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_gate.q_input:rscale" : tensor<f32>
%437 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.25.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%438 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_up.q_input:rscale" : tensor<f32>
%439 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_up.weight3Aqs = util.global.load @"__auto.blk.25.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%440 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_down.q_input:rscale" : tensor<f32>
%441 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_down.weight3Aqs = util.global.load @"__auto.blk.25.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%442 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
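    // blk.26 parameters.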
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xbf16>
%443 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.26.attn_q.q_input3Arscale = util.global.load @"__auto.blk.26.attn_q.q_input:rscale" : tensor<f32>
%444 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_q.weight3Aqs = util.global.load @"__auto.blk.26.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%445 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.26.attn_k.q_input3Arscale = util.global.load @"__auto.blk.26.attn_k.q_input:rscale" : tensor<f32>
%446 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_k.weight3Aqs = util.global.load @"__auto.blk.26.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%447 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.26.attn_v.q_input3Arscale = util.global.load @"__auto.blk.26.attn_v.q_input:rscale" : tensor<f32>
%448 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_v.weight3Aqs = util.global.load @"__auto.blk.26.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%449 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.26.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.26.kv_cache.quantizer:rscale" : tensor<f32>
%450 = torch_c.from_builtin_tensor %__auto.blk.26.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_output.q_input3Arscale = util.global.load @"__auto.blk.26.attn_output.q_input:rscale" : tensor<f32>
%451 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_output.weight3Aqs = util.global.load @"__auto.blk.26.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%452 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xbf16>
%453 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.26.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_gate.q_input:rscale" : tensor<f32>
%454 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.26.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%455 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_up.q_input:rscale" : tensor<f32>
%456 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_up.weight3Aqs = util.global.load @"__auto.blk.26.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%457 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_down.q_input:rscale" : tensor<f32>
%458 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_down.weight3Aqs = util.global.load @"__auto.blk.26.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%459 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
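    // blk.27 parameters.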
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xbf16>
%460 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.27.attn_q.q_input3Arscale = util.global.load @"__auto.blk.27.attn_q.q_input:rscale" : tensor<f32>
%461 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_q.weight3Aqs = util.global.load @"__auto.blk.27.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%462 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.27.attn_k.q_input3Arscale = util.global.load @"__auto.blk.27.attn_k.q_input:rscale" : tensor<f32>
%463 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_k.weight3Aqs = util.global.load @"__auto.blk.27.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%464 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.27.attn_v.q_input3Arscale = util.global.load @"__auto.blk.27.attn_v.q_input:rscale" : tensor<f32>
%465 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_v.weight3Aqs = util.global.load @"__auto.blk.27.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%466 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.27.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.27.kv_cache.quantizer:rscale" : tensor<f32>
%467 = torch_c.from_builtin_tensor %__auto.blk.27.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_output.q_input3Arscale = util.global.load @"__auto.blk.27.attn_output.q_input:rscale" : tensor<f32>
%468 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_output.weight3Aqs = util.global.load @"__auto.blk.27.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%469 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xbf16>
%470 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.27.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_gate.q_input:rscale" : tensor<f32>
%471 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.27.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%472 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_up.q_input:rscale" : tensor<f32>
%473 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_up.weight3Aqs = util.global.load @"__auto.blk.27.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%474 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_down.q_input:rscale" : tensor<f32>
%475 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_down.weight3Aqs = util.global.load @"__auto.blk.27.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%476 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
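    // blk.28 parameters.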
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xbf16>
%477 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.28.attn_q.q_input3Arscale = util.global.load @"__auto.blk.28.attn_q.q_input:rscale" : tensor<f32>
%478 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_q.weight3Aqs = util.global.load @"__auto.blk.28.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%479 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.28.attn_k.q_input3Arscale = util.global.load @"__auto.blk.28.attn_k.q_input:rscale" : tensor<f32>
%480 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_k.weight3Aqs = util.global.load @"__auto.blk.28.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%481 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.28.attn_v.q_input3Arscale = util.global.load @"__auto.blk.28.attn_v.q_input:rscale" : tensor<f32>
%482 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_v.weight3Aqs = util.global.load @"__auto.blk.28.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%483 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.28.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.28.kv_cache.quantizer:rscale" : tensor<f32>
%484 = torch_c.from_builtin_tensor %__auto.blk.28.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_output.q_input3Arscale = util.global.load @"__auto.blk.28.attn_output.q_input:rscale" : tensor<f32>
%485 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_output.weight3Aqs = util.global.load @"__auto.blk.28.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%486 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xbf16>
%487 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.28.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_gate.q_input:rscale" : tensor<f32>
%488 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.28.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%489 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_up.q_input:rscale" : tensor<f32>
%490 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_up.weight3Aqs = util.global.load @"__auto.blk.28.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%491 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_down.q_input:rscale" : tensor<f32>
%492 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_down.weight3Aqs = util.global.load @"__auto.blk.28.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%493 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
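    // blk.29 parameters.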
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xbf16>
%494 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.29.attn_q.q_input3Arscale = util.global.load @"__auto.blk.29.attn_q.q_input:rscale" : tensor<f32>
%495 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_q.weight3Aqs = util.global.load @"__auto.blk.29.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%496 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.29.attn_k.q_input3Arscale = util.global.load @"__auto.blk.29.attn_k.q_input:rscale" : tensor<f32>
%497 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_k.weight3Aqs = util.global.load @"__auto.blk.29.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%498 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.29.attn_v.q_input3Arscale = util.global.load @"__auto.blk.29.attn_v.q_input:rscale" : tensor<f32>
%499 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_v.weight3Aqs = util.global.load @"__auto.blk.29.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%500 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.29.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.29.kv_cache.quantizer:rscale" : tensor<f32>
%501 = torch_c.from_builtin_tensor %__auto.blk.29.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_output.q_input3Arscale = util.global.load @"__auto.blk.29.attn_output.q_input:rscale" : tensor<f32>
%502 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_output.weight3Aqs = util.global.load @"__auto.blk.29.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%503 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xbf16>
%504 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.29.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_gate.q_input:rscale" : tensor<f32>
%505 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.29.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%506 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_up.q_input:rscale" : tensor<f32>
%507 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_up.weight3Aqs = util.global.load @"__auto.blk.29.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%508 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_down.q_input:rscale" : tensor<f32>
%509 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_down.weight3Aqs = util.global.load @"__auto.blk.29.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%510 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
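    // blk.30 parameters.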
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xbf16>
%511 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.30.attn_q.q_input3Arscale = util.global.load @"__auto.blk.30.attn_q.q_input:rscale" : tensor<f32>
%512 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_q.weight3Aqs = util.global.load @"__auto.blk.30.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%513 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.30.attn_k.q_input3Arscale = util.global.load @"__auto.blk.30.attn_k.q_input:rscale" : tensor<f32>
%514 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_k.weight3Aqs = util.global.load @"__auto.blk.30.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%515 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.30.attn_v.q_input3Arscale = util.global.load @"__auto.blk.30.attn_v.q_input:rscale" : tensor<f32>
%516 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_v.weight3Aqs = util.global.load @"__auto.blk.30.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%517 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.30.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.30.kv_cache.quantizer:rscale" : tensor<f32>
%518 = torch_c.from_builtin_tensor %__auto.blk.30.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_output.q_input3Arscale = util.global.load @"__auto.blk.30.attn_output.q_input:rscale" : tensor<f32>
%519 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_output.weight3Aqs = util.global.load @"__auto.blk.30.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%520 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xbf16>
%521 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.30.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_gate.q_input:rscale" : tensor<f32>
%522 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.30.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%523 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_up.q_input:rscale" : tensor<f32>
%524 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_up.weight3Aqs = util.global.load @"__auto.blk.30.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%525 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_down.q_input:rscale" : tensor<f32>
%526 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_down.weight3Aqs = util.global.load @"__auto.blk.30.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%527 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
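    // blk.31 parameters (final transformer block).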
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xbf16>
%528 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.31.attn_q.q_input3Arscale = util.global.load @"__auto.blk.31.attn_q.q_input:rscale" : tensor<f32>
%529 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_q.weight3Aqs = util.global.load @"__auto.blk.31.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%530 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.31.attn_k.q_input3Arscale = util.global.load @"__auto.blk.31.attn_k.q_input:rscale" : tensor<f32>
%531 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_k.weight3Aqs = util.global.load @"__auto.blk.31.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%532 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.31.attn_v.q_input3Arscale = util.global.load @"__auto.blk.31.attn_v.q_input:rscale" : tensor<f32>
%533 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_v.weight3Aqs = util.global.load @"__auto.blk.31.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%534 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.31.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.31.kv_cache.quantizer:rscale" : tensor<f32>
%535 = torch_c.from_builtin_tensor %__auto.blk.31.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_output.q_input3Arscale = util.global.load @"__auto.blk.31.attn_output.q_input:rscale" : tensor<f32>
%536 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_output.weight3Aqs = util.global.load @"__auto.blk.31.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%537 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xbf16>
%538 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.31.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_gate.q_input:rscale" : tensor<f32>
%539 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.31.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%540 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_up.q_input:rscale" : tensor<f32>
%541 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_up.weight3Aqs = util.global.load @"__auto.blk.31.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%542 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_down.q_input:rscale" : tensor<f32>
%543 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_down.weight3Aqs = util.global.load @"__auto.blk.31.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%544 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
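    // final output_norm.weight and output.weight (128256x4096 bf16 LM head).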
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xbf16>
%545 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xbf16>
%546 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16>
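    // end of parameter loads. %arg3 is copied to a vtensor (likely the paged KV cache, [s2, 2097152] f16); symbolic dims s1/s2 bind the dynamic shapes of %arg0 (token ids, [1, s1*32]), %arg2 ([1, s1]), and the cache.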
%547 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,2097152],f16>
%548 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int
%549 = torch.symbolic_int "s2" {min_val = 2, max_val = 9223372036854775806} : !torch.int
torch.bind_symbolic_shape %arg0, [%548], affine_map<()[s0] -> (1, s0 * 32)> : !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %arg2, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %547, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
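    // token embedding: cast token_embd.weight (%0) from bf16 to f16, then aten.embedding gathers rows for %arg0 -> [1, s1*32, 4096] f16.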
%int5 = torch.constant.int 5
%550 = torch.prims.convert_element_type %0, %int5 : !torch.vtensor<[128256,4096],bf16>, !torch.int -> !torch.vtensor<[128256,4096],f16>
%int-1 = torch.constant.int -1
%false = torch.constant.bool false
%false_0 = torch.constant.bool false
%551 = torch.aten.embedding %550, %arg0, %int-1, %false, %false_0 : !torch.vtensor<[128256,4096],f16>, !torch.vtensor<[1,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %551, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
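    // blk.0 attention RMSNorm: mean(x^2) over the last dim (keepdim), add eps 1e-5, rsqrt, scale, then multiply by attn_norm.weight (%1).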
%int6 = torch.constant.int 6
%552 = torch.prims.convert_element_type %551, %int6 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %552, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2 = torch.constant.int 2
%553 = torch.aten.pow.Tensor_Scalar %552, %int2 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %553, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1 = torch.constant.int -1
%554 = torch.prim.ListConstruct %int-1_1 : (!torch.int) -> !torch.list<int>
%true = torch.constant.bool true
%none = torch.constant.none
%555 = torch.aten.mean.dim %553, %554, %true, %none : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %555, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05 = torch.constant.float 1.000000e-05
%int1 = torch.constant.int 1
%556 = torch.aten.add.Scalar %555, %float1.000000e-05, %int1 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %556, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%557 = torch.aten.rsqrt %556 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %557, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%558 = torch.aten.mul.Tensor %552, %557 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %558, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int5_2 = torch.constant.int 5
%559 = torch.prims.convert_element_type %558, %int5_2 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %559, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%560 = torch.aten.mul.Tensor %1, %559 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f16> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %560, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int5_3 = torch.constant.int 5
%561 = torch.prims.convert_element_type %560, %int5_3 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %561, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
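    // blk.0 attn_q projection: quantize the normed input (divide by q_input:rscale %2, clamp to +/-240 = f8E4M3FNUZ max, cast to f8), f8 matmul with the transposed attn_q.weight:qs (%3), reshape back to [1, seq, 4096], cast result to bf16.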
%562 = torch.aten.div.Tensor %561, %2 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %562, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%float-2.400000e02 = torch.constant.float -2.400000e+02
%float2.400000e02 = torch.constant.float 2.400000e+02
%563 = torch.aten.clamp %562, %float-2.400000e02, %float2.400000e02 : !torch.vtensor<[1,?,4096],f16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %563, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%int26 = torch.constant.int 26
%564 = torch.prims.convert_element_type %563, %int26 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %564, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2 = torch.constant.int -2
%int-1_4 = torch.constant.int -1
%565 = torch.aten.transpose.int %3, %int-2, %int-1_4 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int1_5 = torch.constant.int 1
%566 = torch.aten.size.int %arg0, %int1_5 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int
%int4096 = torch.constant.int 4096
%567 = torch.prim.ListConstruct %566, %int4096 : (!torch.int, !torch.int) -> !torch.list<int>
%568 = torch.aten.view %564, %567 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %568, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%569 = torch.aten.mm %568, %565 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %569, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_6 = torch.constant.int 1
%int4096_7 = torch.constant.int 4096
%570 = torch.prim.ListConstruct %int1_6, %566, %int4096_7 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%571 = torch.aten.view %569, %570 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %571, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15 = torch.constant.int 15
%572 = torch.prims.convert_element_type %571, %int15 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %572, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
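    // blk.0 attn_k projection: same quantize / f8 matmul / bf16 cast pattern with attn_k scale %4 and weight %5, 4096 -> 1024.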
%573 = torch.aten.div.Tensor %561, %4 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %573, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%float-2.400000e02_8 = torch.constant.float -2.400000e+02
%float2.400000e02_9 = torch.constant.float 2.400000e+02
%574 = torch.aten.clamp %573, %float-2.400000e02_8, %float2.400000e02_9 : !torch.vtensor<[1,?,4096],f16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %574, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%int26_10 = torch.constant.int 26
%575 = torch.prims.convert_element_type %574, %int26_10 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %575, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_11 = torch.constant.int -2
%int-1_12 = torch.constant.int -1
%576 = torch.aten.transpose.int %5, %int-2_11, %int-1_12 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_13 = torch.constant.int 4096
%577 = torch.prim.ListConstruct %566, %int4096_13 : (!torch.int, !torch.int) -> !torch.list<int>
%578 = torch.aten.view %575, %577 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %578, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%579 = torch.aten.mm %578, %576 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %579, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_14 = torch.constant.int 1
%int1024 = torch.constant.int 1024
%580 = torch.prim.ListConstruct %int1_14, %566, %int1024 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%581 = torch.aten.view %579, %580 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %581, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_15 = torch.constant.int 15
%582 = torch.prims.convert_element_type %581, %int15_15 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %582, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
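    // blk.0 attn_v projection: same pattern with attn_v scale %6 and weight %7, 4096 -> 1024.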
%583 = torch.aten.div.Tensor %561, %6 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %583, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%float-2.400000e02_16 = torch.constant.float -2.400000e+02
%float2.400000e02_17 = torch.constant.float 2.400000e+02
%584 = torch.aten.clamp %583, %float-2.400000e02_16, %float2.400000e02_17 : !torch.vtensor<[1,?,4096],f16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f16>
torch.bind_symbolic_shape %584, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f16>
%int26_18 = torch.constant.int 26
%585 = torch.prims.convert_element_type %584, %int26_18 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %585, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_19 = torch.constant.int -2
%int-1_20 = torch.constant.int -1
%586 = torch.aten.transpose.int %7, %int-2_19, %int-1_20 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_21 = torch.constant.int 4096
%587 = torch.prim.ListConstruct %566, %int4096_21 : (!torch.int, !torch.int) -> !torch.list<int>
%588 = torch.aten.view %585, %587 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %588, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%589 = torch.aten.mm %588, %586 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %589, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_22 = torch.constant.int 1
%int1024_23 = torch.constant.int 1024
%590 = torch.prim.ListConstruct %int1_22, %566, %int1024_23 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%591 = torch.aten.view %589, %590 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %591, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_24 = torch.constant.int 15
%592 = torch.prims.convert_element_type %591, %int15_24 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %592, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
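    // split into heads: Q -> [1, seq, 32, 128]; K and V -> [1, seq, 8, 128] (8 KV heads, i.e. grouped-query attention).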
%int1_25 = torch.constant.int 1
%int32 = torch.constant.int 32
%int128 = torch.constant.int 128
%593 = torch.prim.ListConstruct %int1_25, %566, %int32, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%594 = torch.aten.view %572, %593 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %594, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_26 = torch.constant.int 1
%int8 = torch.constant.int 8
%int128_27 = torch.constant.int 128
%595 = torch.prim.ListConstruct %int1_26, %566, %int8, %int128_27 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%596 = torch.aten.view %582, %595 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %596, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_28 = torch.constant.int 1
%int8_29 = torch.constant.int 8
%int128_30 = torch.constant.int 128
%597 = torch.prim.ListConstruct %int1_28, %566, %int8_29, %int128_30 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%598 = torch.aten.view %592, %597 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %598, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
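    // RoPE frequency table: positions 0..131071, inv_freq[i] = 500000^(-2*floor(i/2)/128), outer product -> [131072, 128] f32.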
%int131072 = torch.constant.int 131072
%none_31 = torch.constant.none
%none_32 = torch.constant.none
%cpu = torch.constant.device "cpu"
%false_33 = torch.constant.bool false
%599 = torch.aten.arange %int131072, %none_31, %none_32, %cpu, %false_33 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0 = torch.constant.int 0
%int128_34 = torch.constant.int 128
%none_35 = torch.constant.none
%none_36 = torch.constant.none
%cpu_37 = torch.constant.device "cpu"
%false_38 = torch.constant.bool false
%600 = torch.aten.arange.start %int0, %int128_34, %none_35, %none_36, %cpu_37, %false_38 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_39 = torch.constant.int 2
%601 = torch.aten.floor_divide.Scalar %600, %int2_39 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_40 = torch.constant.int 6
%602 = torch.prims.convert_element_type %601, %int6_40 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_41 = torch.constant.int 128
%603 = torch.aten.div.Scalar %602, %int128_41 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00 = torch.constant.float 2.000000e+00
%604 = torch.aten.mul.Scalar %603, %float2.000000e00 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05 = torch.constant.float 5.000000e+05
%605 = torch.aten.pow.Scalar %float5.000000e05, %604 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%606 = torch.aten.reciprocal %605 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00 = torch.constant.float 1.000000e+00
%607 = torch.aten.mul.Scalar %606, %float1.000000e00 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_42 = torch.constant.int 131072
%int1_43 = torch.constant.int 1
%608 = torch.prim.ListConstruct %int131072_42, %int1_43 : (!torch.int, !torch.int) -> !torch.list<int>
%609 = torch.aten.view %599, %608 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%610 = torch.aten.mul.Tensor %609, %607 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
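    // slice the table to the current sequence length and apply rotary embedding to Q in f32 via the sharktank_rotary_embedding kernel, then cast back to bf16.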
%int1_44 = torch.constant.int 1
%611 = torch.aten.size.int %571, %int1_44 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_45 = torch.constant.int 0
%612 = torch.aten.add.int %int0_45, %611 : !torch.int, !torch.int -> !torch.int
%int0_46 = torch.constant.int 0
%int0_47 = torch.constant.int 0
%int1_48 = torch.constant.int 1
%613 = torch.aten.slice.Tensor %610, %int0_46, %int0_47, %612, %int1_48 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %613, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_49 = torch.constant.int 1
%int0_50 = torch.constant.int 0
%int9223372036854775807 = torch.constant.int 9223372036854775807
%int1_51 = torch.constant.int 1
%614 = torch.aten.slice.Tensor %613, %int1_49, %int0_50, %int9223372036854775807, %int1_51 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %614, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_52 = torch.constant.int 1
%int0_53 = torch.constant.int 0
%int9223372036854775807_54 = torch.constant.int 9223372036854775807
%int1_55 = torch.constant.int 1
%615 = torch.aten.slice.Tensor %614, %int1_52, %int0_53, %int9223372036854775807_54, %int1_55 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %615, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_56 = torch.constant.int 0
%616 = torch.aten.unsqueeze %615, %int0_56 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %616, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_57 = torch.constant.int 1
%int0_58 = torch.constant.int 0
%int9223372036854775807_59 = torch.constant.int 9223372036854775807
%int1_60 = torch.constant.int 1
%617 = torch.aten.slice.Tensor %616, %int1_57, %int0_58, %int9223372036854775807_59, %int1_60 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %617, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_61 = torch.constant.int 2
%int0_62 = torch.constant.int 0
%int9223372036854775807_63 = torch.constant.int 9223372036854775807
%int1_64 = torch.constant.int 1
%618 = torch.aten.slice.Tensor %617, %int2_61, %int0_62, %int9223372036854775807_63, %int1_64 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %618, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_65 = torch.constant.int 1
%int1_66 = torch.constant.int 1
%int1_67 = torch.constant.int 1
%619 = torch.prim.ListConstruct %int1_65, %int1_66, %int1_67 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%620 = torch.aten.repeat %618, %619 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %620, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
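// Apply rotary embedding to the 32-head query tensor: upcast bf16 -> f32,
// call the sharktank rotary kernel, and cast the result back to bf16.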
%int6_68 = torch.constant.int 6
%621 = torch.prims.convert_element_type %594, %int6_68 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %621, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%622 = torch_c.to_builtin_tensor %621 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%623 = torch_c.to_builtin_tensor %620 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%624 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%622, %623) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%625 = torch_c.from_builtin_tensor %624 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %625, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_69 = torch.constant.int 15
%626 = torch.prims.convert_element_type %625, %int15_69 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %626, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
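// The same angle-table construction is repeated below and the rotary embedding
// is applied to the 8-head key tensor (%596) via the 8-head kernel variant.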
%int131072_70 = torch.constant.int 131072
%none_71 = torch.constant.none
%none_72 = torch.constant.none
%cpu_73 = torch.constant.device "cpu"
%false_74 = torch.constant.bool false
%627 = torch.aten.arange %int131072_70, %none_71, %none_72, %cpu_73, %false_74 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_75 = torch.constant.int 0
%int128_76 = torch.constant.int 128
%none_77 = torch.constant.none
%none_78 = torch.constant.none
%cpu_79 = torch.constant.device "cpu"
%false_80 = torch.constant.bool false
%628 = torch.aten.arange.start %int0_75, %int128_76, %none_77, %none_78, %cpu_79, %false_80 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_81 = torch.constant.int 2
%629 = torch.aten.floor_divide.Scalar %628, %int2_81 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_82 = torch.constant.int 6
%630 = torch.prims.convert_element_type %629, %int6_82 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_83 = torch.constant.int 128
%631 = torch.aten.div.Scalar %630, %int128_83 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_84 = torch.constant.float 2.000000e+00
%632 = torch.aten.mul.Scalar %631, %float2.000000e00_84 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_85 = torch.constant.float 5.000000e+05
%633 = torch.aten.pow.Scalar %float5.000000e05_85, %632 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%634 = torch.aten.reciprocal %633 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_86 = torch.constant.float 1.000000e+00
%635 = torch.aten.mul.Scalar %634, %float1.000000e00_86 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_87 = torch.constant.int 131072
%int1_88 = torch.constant.int 1
%636 = torch.prim.ListConstruct %int131072_87, %int1_88 : (!torch.int, !torch.int) -> !torch.list<int>
%637 = torch.aten.view %627, %636 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%638 = torch.aten.mul.Tensor %637, %635 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_89 = torch.constant.int 1
%639 = torch.aten.size.int %581, %int1_89 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_90 = torch.constant.int 0
%640 = torch.aten.add.int %int0_90, %639 : !torch.int, !torch.int -> !torch.int
%int0_91 = torch.constant.int 0
%int0_92 = torch.constant.int 0
%int1_93 = torch.constant.int 1
%641 = torch.aten.slice.Tensor %638, %int0_91, %int0_92, %640, %int1_93 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %641, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_94 = torch.constant.int 1
%int0_95 = torch.constant.int 0
%int9223372036854775807_96 = torch.constant.int 9223372036854775807
%int1_97 = torch.constant.int 1
%642 = torch.aten.slice.Tensor %641, %int1_94, %int0_95, %int9223372036854775807_96, %int1_97 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %642, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_98 = torch.constant.int 1
%int0_99 = torch.constant.int 0
%int9223372036854775807_100 = torch.constant.int 9223372036854775807
%int1_101 = torch.constant.int 1
%643 = torch.aten.slice.Tensor %642, %int1_98, %int0_99, %int9223372036854775807_100, %int1_101 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %643, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_102 = torch.constant.int 0
%644 = torch.aten.unsqueeze %643, %int0_102 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %644, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_103 = torch.constant.int 1
%int0_104 = torch.constant.int 0
%int9223372036854775807_105 = torch.constant.int 9223372036854775807
%int1_106 = torch.constant.int 1
%645 = torch.aten.slice.Tensor %644, %int1_103, %int0_104, %int9223372036854775807_105, %int1_106 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %645, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_107 = torch.constant.int 2
%int0_108 = torch.constant.int 0
%int9223372036854775807_109 = torch.constant.int 9223372036854775807
%int1_110 = torch.constant.int 1
%646 = torch.aten.slice.Tensor %645, %int2_107, %int0_108, %int9223372036854775807_109, %int1_110 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %646, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_111 = torch.constant.int 1
%int1_112 = torch.constant.int 1
%int1_113 = torch.constant.int 1
%647 = torch.prim.ListConstruct %int1_111, %int1_112, %int1_113 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%648 = torch.aten.repeat %646, %647 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %648, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_114 = torch.constant.int 6
%649 = torch.prims.convert_element_type %596, %int6_114 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %649, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%650 = torch_c.to_builtin_tensor %649 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%651 = torch_c.to_builtin_tensor %648 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%652 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%650, %651) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%653 = torch_c.from_builtin_tensor %652 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %653, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_115 = torch.constant.int 15
%654 = torch.prims.convert_element_type %653, %int15_115 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %654, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
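// Quantize the rotated keys (%654) and the values (%598) for the KV cache:
// divide by the per-tensor scale %8, clamp to [-240, 240] (the representable
// f8E4M3FNUZ range), and convert to f8E4M3FNUZ.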
%655 = torch.aten.div.Tensor %654, %8 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %655, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_116 = torch.constant.float -2.400000e+02
%float2.400000e02_117 = torch.constant.float 2.400000e+02
%656 = torch.aten.clamp %655, %float-2.400000e02_116, %float2.400000e02_117 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %656, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_118 = torch.constant.int 26
%657 = torch.prims.convert_element_type %656, %int26_118 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %657, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%658 = torch.aten.div.Tensor %598, %8 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %658, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_119 = torch.constant.float -2.400000e+02
%float2.400000e02_120 = torch.constant.float 2.400000e+02
%659 = torch.aten.clamp %658, %float-2.400000e02_119, %float2.400000e02_120 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %659, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_121 = torch.constant.int 26
%660 = torch.prims.convert_element_type %659, %int26_121 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %660, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
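// Scatter the quantized keys into the paged cache. %547 ([pages, 2097152] f16)
// is viewed as [pages, 32, 2, 32, 8, 128], consistent with 32 layers x {K, V}
// x 32 tokens per page x 8 kv-heads x 128, then flattened to
// [pages*64, 32, 8, 128]; key rows go to slot page_id * 64 + 0 (%669) via index_put.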
%int0_122 = torch.constant.int 0
%661 = torch.aten.size.int %547, %int0_122 : !torch.vtensor<[?,2097152],f16>, !torch.int -> !torch.int
%int32_123 = torch.constant.int 32
%int2_124 = torch.constant.int 2
%int32_125 = torch.constant.int 32
%int8_126 = torch.constant.int 8
%int128_127 = torch.constant.int 128
%662 = torch.prim.ListConstruct %661, %int32_123, %int2_124, %int32_125, %int8_126, %int128_127 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%663 = torch.aten.view %547, %662 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %663, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_128 = torch.constant.int 32
%664 = torch.aten.mul.int %661, %int32_128 : !torch.int, !torch.int -> !torch.int
%int2_129 = torch.constant.int 2
%665 = torch.aten.mul.int %664, %int2_129 : !torch.int, !torch.int -> !torch.int
%int32_130 = torch.constant.int 32
%int8_131 = torch.constant.int 8
%int128_132 = torch.constant.int 128
%666 = torch.prim.ListConstruct %665, %int32_130, %int8_131, %int128_132 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%667 = torch.aten.view %663, %666 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %667, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int64 = torch.constant.int 64
%668 = torch.aten.mul.Scalar %arg2, %int64 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %668, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int0_133 = torch.constant.int 0
%int1_134 = torch.constant.int 1
%669 = torch.aten.add.Scalar %668, %int0_133, %int1_134 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %669, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_135 = torch.constant.int 1
%670 = torch.aten.size.int %arg2, %int1_135 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.int
%int1_136 = torch.constant.int 1
%int32_137 = torch.constant.int 32
%int8_138 = torch.constant.int 8
%int128_139 = torch.constant.int 128
%671 = torch.prim.ListConstruct %int1_136, %670, %int32_137, %int8_138, %int128_139 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%672 = torch.aten.view %657, %671 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %672, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_140 = torch.constant.int 32
%int8_141 = torch.constant.int 8
%int128_142 = torch.constant.int 128
%673 = torch.prim.ListConstruct %670, %int32_140, %int8_141, %int128_142 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%674 = torch.aten.view %672, %673 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %674, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%675 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%676 = torch.aten.view %669, %675 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %676, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%677 = torch.prim.ListConstruct %676 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_143 = torch.constant.bool false
%678 = torch.aten.index_put %667, %677, %674, %false_143 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %678, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_144 = torch.constant.int 32
%int2_145 = torch.constant.int 2
%int32_146 = torch.constant.int 32
%int8_147 = torch.constant.int 8
%int128_148 = torch.constant.int 128
%679 = torch.prim.ListConstruct %661, %int32_144, %int2_145, %int32_146, %int8_147, %int128_148 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%680 = torch.aten.view %678, %679 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %680, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152 = torch.constant.int 2097152
%681 = torch.prim.ListConstruct %661, %int2097152 : (!torch.int, !torch.int) -> !torch.list<int>
%682 = torch.aten.view %680, %681 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %682, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
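// Re-view the updated cache and scatter the quantized values at slot
// page_id * 64 + 1 (%691), then flatten back to the [pages, 2097152] layout.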
%int32_149 = torch.constant.int 32
%int2_150 = torch.constant.int 2
%int32_151 = torch.constant.int 32
%int8_152 = torch.constant.int 8
%int128_153 = torch.constant.int 128
%683 = torch.prim.ListConstruct %661, %int32_149, %int2_150, %int32_151, %int8_152, %int128_153 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%684 = torch.aten.view %682, %683 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %684, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_154 = torch.constant.int 32
%int8_155 = torch.constant.int 8
%int128_156 = torch.constant.int 128
%685 = torch.prim.ListConstruct %665, %int32_154, %int8_155, %int128_156 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%686 = torch.aten.view %684, %685 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %686, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_157 = torch.constant.int 1
%int32_158 = torch.constant.int 32
%int8_159 = torch.constant.int 8
%int128_160 = torch.constant.int 128
%687 = torch.prim.ListConstruct %int1_157, %670, %int32_158, %int8_159, %int128_160 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%688 = torch.aten.view %660, %687 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %688, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_161 = torch.constant.int 32
%int8_162 = torch.constant.int 8
%int128_163 = torch.constant.int 128
%689 = torch.prim.ListConstruct %670, %int32_161, %int8_162, %int128_163 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%690 = torch.aten.view %688, %689 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %690, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_164 = torch.constant.int 1
%int1_165 = torch.constant.int 1
%691 = torch.aten.add.Scalar %669, %int1_164, %int1_165 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %691, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%692 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%693 = torch.aten.view %691, %692 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %693, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%694 = torch.prim.ListConstruct %693 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_166 = torch.constant.bool false
%695 = torch.aten.index_put %686, %694, %690, %false_166 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %695, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_167 = torch.constant.int 32
%int2_168 = torch.constant.int 2
%int32_169 = torch.constant.int 32
%int8_170 = torch.constant.int 8
%int128_171 = torch.constant.int 128
%696 = torch.prim.ListConstruct %661, %int32_167, %int2_168, %int32_169, %int8_170, %int128_171 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%697 = torch.aten.view %695, %696 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %697, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_172 = torch.constant.int 2097152
%698 = torch.prim.ListConstruct %661, %int2097152_172 : (!torch.int, !torch.int) -> !torch.list<int>
%699 = torch.aten.view %697, %698 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %699, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
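// Grouped-query attention: unsqueeze the 8 kv-heads of K and V, expand each by
// a factor of 4, and reshape to 32 heads to match the 32 query heads.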
%int-2_173 = torch.constant.int -2
%700 = torch.aten.unsqueeze %657, %int-2_173 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %700, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_174 = torch.constant.int 1
%int8_175 = torch.constant.int 8
%int4 = torch.constant.int 4
%int128_176 = torch.constant.int 128
%701 = torch.prim.ListConstruct %int1_174, %639, %int8_175, %int4, %int128_176 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_177 = torch.constant.bool false
%702 = torch.aten.expand %700, %701, %false_177 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %702, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_178 = torch.constant.int 0
%703 = torch.aten.clone %702, %int0_178 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %703, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_179 = torch.constant.int 1
%int32_180 = torch.constant.int 32
%int128_181 = torch.constant.int 128
%704 = torch.prim.ListConstruct %int1_179, %639, %int32_180, %int128_181 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%705 = torch.aten._unsafe_view %703, %704 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %705, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_182 = torch.constant.int -2
%706 = torch.aten.unsqueeze %660, %int-2_182 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %706, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_183 = torch.constant.int 1
%707 = torch.aten.size.int %591, %int1_183 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_184 = torch.constant.int 1
%int8_185 = torch.constant.int 8
%int4_186 = torch.constant.int 4
%int128_187 = torch.constant.int 128
%708 = torch.prim.ListConstruct %int1_184, %707, %int8_185, %int4_186, %int128_187 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_188 = torch.constant.bool false
%709 = torch.aten.expand %706, %708, %false_188 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %709, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_189 = torch.constant.int 0
%710 = torch.aten.clone %709, %int0_189 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %710, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_190 = torch.constant.int 1
%int32_191 = torch.constant.int 32
%int128_192 = torch.constant.int 128
%711 = torch.prim.ListConstruct %int1_190, %707, %int32_191, %int128_192 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%712 = torch.aten._unsafe_view %710, %711 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %712, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
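// Dequantize the expanded K and V: upcast f8E4M3FNUZ -> f32, multiply by the
// scale %8, and cast down to bf16 for attention.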
%int6_193 = torch.constant.int 6
%713 = torch.prims.convert_element_type %705, %int6_193 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %713, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%714 = torch.aten.mul.Tensor %713, %8 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %714, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_194 = torch.constant.int 15
%715 = torch.prims.convert_element_type %714, %int15_194 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %715, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_195 = torch.constant.int 6
%716 = torch.prims.convert_element_type %712, %int6_195 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %716, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%717 = torch.aten.mul.Tensor %716, %8 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %717, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_196 = torch.constant.int 15
%718 = torch.prims.convert_element_type %717, %int15_196 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %718, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
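// Transpose Q, K, V to [1, 32, seq, 128] and run causal scaled-dot-product
// attention (dropout 0.0, is_causal = true) through the CPU flash-attention op.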
%int1_197 = torch.constant.int 1
%int2_198 = torch.constant.int 2
%719 = torch.aten.transpose.int %626, %int1_197, %int2_198 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %719, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_199 = torch.constant.int 1
%int2_200 = torch.constant.int 2
%720 = torch.aten.transpose.int %715, %int1_199, %int2_200 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %720, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_201 = torch.constant.int 1
%int2_202 = torch.constant.int 2
%721 = torch.aten.transpose.int %718, %int1_201, %int2_202 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %721, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00 = torch.constant.float 0.000000e+00
%true_203 = torch.constant.bool true
%none_204 = torch.constant.none
%none_205 = torch.constant.none
%722:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%719, %720, %721, %float0.000000e00, %true_203, %none_204, %none_205) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %722#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
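// Transpose the attention output back to [1, seq, 32, 128], flatten to
// [1, seq, 4096], quantize to f8E4M3FNUZ (scale %9, clamp +/-240), apply the
// 4096x4096 output projection %10, dequantize to bf16, and add the residual %551.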
%int1_206 = torch.constant.int 1
%int2_207 = torch.constant.int 2
%723 = torch.aten.transpose.int %722#0, %int1_206, %int2_207 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %723, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_208 = torch.constant.int 1
%int4096_209 = torch.constant.int 4096
%724 = torch.prim.ListConstruct %int1_208, %611, %int4096_209 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%725 = torch.aten.view %723, %724 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %725, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%726 = torch.aten.div.Tensor %725, %9 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %726, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_210 = torch.constant.float -2.400000e+02
%float2.400000e02_211 = torch.constant.float 2.400000e+02
%727 = torch.aten.clamp %726, %float-2.400000e02_210, %float2.400000e02_211 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %727, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_212 = torch.constant.int 26
%728 = torch.prims.convert_element_type %727, %int26_212 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %728, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_213 = torch.constant.int -2
%int-1_214 = torch.constant.int -1
%729 = torch.aten.transpose.int %10, %int-2_213, %int-1_214 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_215 = torch.constant.int 4096
%730 = torch.prim.ListConstruct %611, %int4096_215 : (!torch.int, !torch.int) -> !torch.list<int>
%731 = torch.aten.view %728, %730 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %731, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%732 = torch.aten.mm %731, %729 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %732, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_216 = torch.constant.int 1
%int4096_217 = torch.constant.int 4096
%733 = torch.prim.ListConstruct %int1_216, %611, %int4096_217 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%734 = torch.aten.view %732, %733 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %734, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_218 = torch.constant.int 15
%735 = torch.prims.convert_element_type %734, %int15_218 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %735, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_219 = torch.constant.int 1
%736 = torch.aten.add.Tensor %551, %735, %int1_219 : !torch.vtensor<[1,?,4096],f16>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %736, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
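// RMSNorm over the residual stream: mean of squares along the last dim,
// add eps 1e-05, rsqrt, scale, then multiply by the [4096] norm weight %11.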
%int2_220 = torch.constant.int 2
%737 = torch.aten.pow.Tensor_Scalar %736, %int2_220 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %737, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_221 = torch.constant.int -1
%738 = torch.prim.ListConstruct %int-1_221 : (!torch.int) -> !torch.list<int>
%true_222 = torch.constant.bool true
%none_223 = torch.constant.none
%739 = torch.aten.mean.dim %737, %738, %true_222, %none_223 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %739, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_224 = torch.constant.float 1.000000e-05
%int1_225 = torch.constant.int 1
%740 = torch.aten.add.Scalar %739, %float1.000000e-05_224, %int1_225 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %740, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%741 = torch.aten.rsqrt %740 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %741, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%742 = torch.aten.mul.Tensor %736, %741 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %742, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%743 = torch.aten.mul.Tensor %11, %742 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %743, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
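// SwiGLU-style feed-forward in f8E4M3FNUZ: quantize the normed activations per
// projection (scales %12, %14, clamp +/-240), compute gate = silu(x @ %13^T) and
// up = x @ %15^T (both 4096 -> 14336), multiply them, re-quantize with scale %16,
// and project back down through %17 (14336 -> 4096).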
%744 = torch.aten.div.Tensor %743, %12 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %744, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_226 = torch.constant.float -2.400000e+02
%float2.400000e02_227 = torch.constant.float 2.400000e+02
%745 = torch.aten.clamp %744, %float-2.400000e02_226, %float2.400000e02_227 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %745, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_228 = torch.constant.int 26
%746 = torch.prims.convert_element_type %745, %int26_228 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %746, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_229 = torch.constant.int -2
%int-1_230 = torch.constant.int -1
%747 = torch.aten.transpose.int %13, %int-2_229, %int-1_230 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_231 = torch.constant.int 4096
%748 = torch.prim.ListConstruct %566, %int4096_231 : (!torch.int, !torch.int) -> !torch.list<int>
%749 = torch.aten.view %746, %748 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %749, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%750 = torch.aten.mm %749, %747 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %750, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_232 = torch.constant.int 1
%int14336 = torch.constant.int 14336
%751 = torch.prim.ListConstruct %int1_232, %566, %int14336 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%752 = torch.aten.view %750, %751 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %752, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_233 = torch.constant.int 15
%753 = torch.prims.convert_element_type %752, %int15_233 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %753, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%754 = torch.aten.silu %753 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %754, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%755 = torch.aten.div.Tensor %743, %14 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %755, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_234 = torch.constant.float -2.400000e+02
%float2.400000e02_235 = torch.constant.float 2.400000e+02
%756 = torch.aten.clamp %755, %float-2.400000e02_234, %float2.400000e02_235 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %756, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_236 = torch.constant.int 26
%757 = torch.prims.convert_element_type %756, %int26_236 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %757, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_237 = torch.constant.int -2
%int-1_238 = torch.constant.int -1
%758 = torch.aten.transpose.int %15, %int-2_237, %int-1_238 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_239 = torch.constant.int 4096
%759 = torch.prim.ListConstruct %566, %int4096_239 : (!torch.int, !torch.int) -> !torch.list<int>
%760 = torch.aten.view %757, %759 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %760, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%761 = torch.aten.mm %760, %758 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %761, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_240 = torch.constant.int 1
%int14336_241 = torch.constant.int 14336
%762 = torch.prim.ListConstruct %int1_240, %566, %int14336_241 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%763 = torch.aten.view %761, %762 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %763, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_242 = torch.constant.int 15
%764 = torch.prims.convert_element_type %763, %int15_242 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %764, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%765 = torch.aten.mul.Tensor %754, %764 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %765, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%766 = torch.aten.div.Tensor %765, %16 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %766, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_243 = torch.constant.float -2.400000e+02
%float2.400000e02_244 = torch.constant.float 2.400000e+02
%767 = torch.aten.clamp %766, %float-2.400000e02_243, %float2.400000e02_244 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %767, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_245 = torch.constant.int 26
%768 = torch.prims.convert_element_type %767, %int26_245 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %768, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_246 = torch.constant.int -2
%int-1_247 = torch.constant.int -1
%769 = torch.aten.transpose.int %17, %int-2_246, %int-1_247 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_248 = torch.constant.int 1
%770 = torch.aten.size.int %752, %int1_248 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_249 = torch.constant.int 14336
%771 = torch.prim.ListConstruct %770, %int14336_249 : (!torch.int, !torch.int) -> !torch.list<int>
%772 = torch.aten.view %768, %771 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %772, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%773 = torch.aten.mm %772, %769 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %773, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_250 = torch.constant.int 1
%int4096_251 = torch.constant.int 4096
%774 = torch.prim.ListConstruct %int1_250, %770, %int4096_251 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%775 = torch.aten.view %773, %774 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %775, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_252 = torch.constant.int 15
%776 = torch.prims.convert_element_type %775, %int15_252 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %776, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_253 = torch.constant.int 1
%777 = torch.aten.add.Tensor %736, %776, %int1_253 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %777, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
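// The residual add above closes this transformer block. Below: what appears to
// be the next block's attention pre-norm (RMSNorm, eps 1e-05, weight %18),
// followed by quantization of its output for the Q/K/V projections.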
%int2_254 = torch.constant.int 2
%778 = torch.aten.pow.Tensor_Scalar %777, %int2_254 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %778, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_255 = torch.constant.int -1
%779 = torch.prim.ListConstruct %int-1_255 : (!torch.int) -> !torch.list<int>
%true_256 = torch.constant.bool true
%none_257 = torch.constant.none
%780 = torch.aten.mean.dim %778, %779, %true_256, %none_257 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %780, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_258 = torch.constant.float 1.000000e-05
%int1_259 = torch.constant.int 1
%781 = torch.aten.add.Scalar %780, %float1.000000e-05_258, %int1_259 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %781, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%782 = torch.aten.rsqrt %781 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %782, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%783 = torch.aten.mul.Tensor %777, %782 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %783, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%784 = torch.aten.mul.Tensor %18, %783 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %784, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%785 = torch.aten.div.Tensor %784, %19 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %785, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_260 = torch.constant.float -2.400000e+02
%float2.400000e02_261 = torch.constant.float 2.400000e+02
%786 = torch.aten.clamp %785, %float-2.400000e02_260, %float2.400000e02_261 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %786, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_262 = torch.constant.int 26
%787 = torch.prims.convert_element_type %786, %int26_262 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %787, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
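// Q/K/V projections for this block in f8E4M3FNUZ: the 4096x4096 weight %20 and
// the 1024x4096 weights %22 and %24 are applied to the quantized normed input;
// each result is dequantized to bf16 and reshaped to [1, seq, 32, 128] (Q) or
// [1, seq, 8, 128] (K, V).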
%int-2_263 = torch.constant.int -2
%int-1_264 = torch.constant.int -1
%788 = torch.aten.transpose.int %20, %int-2_263, %int-1_264 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_265 = torch.constant.int 4096
%789 = torch.prim.ListConstruct %566, %int4096_265 : (!torch.int, !torch.int) -> !torch.list<int>
%790 = torch.aten.view %787, %789 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %790, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%791 = torch.aten.mm %790, %788 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %791, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_266 = torch.constant.int 1
%int4096_267 = torch.constant.int 4096
%792 = torch.prim.ListConstruct %int1_266, %566, %int4096_267 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%793 = torch.aten.view %791, %792 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %793, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_268 = torch.constant.int 15
%794 = torch.prims.convert_element_type %793, %int15_268 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %794, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%795 = torch.aten.div.Tensor %784, %21 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %795, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_269 = torch.constant.float -2.400000e+02
%float2.400000e02_270 = torch.constant.float 2.400000e+02
%796 = torch.aten.clamp %795, %float-2.400000e02_269, %float2.400000e02_270 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %796, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_271 = torch.constant.int 26
%797 = torch.prims.convert_element_type %796, %int26_271 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %797, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_272 = torch.constant.int -2
%int-1_273 = torch.constant.int -1
%798 = torch.aten.transpose.int %22, %int-2_272, %int-1_273 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_274 = torch.constant.int 4096
%799 = torch.prim.ListConstruct %566, %int4096_274 : (!torch.int, !torch.int) -> !torch.list<int>
%800 = torch.aten.view %797, %799 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %800, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%801 = torch.aten.mm %800, %798 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %801, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_275 = torch.constant.int 1
%int1024_276 = torch.constant.int 1024
%802 = torch.prim.ListConstruct %int1_275, %566, %int1024_276 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%803 = torch.aten.view %801, %802 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %803, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_277 = torch.constant.int 15
%804 = torch.prims.convert_element_type %803, %int15_277 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %804, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%805 = torch.aten.div.Tensor %784, %23 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %805, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_278 = torch.constant.float -2.400000e+02
%float2.400000e02_279 = torch.constant.float 2.400000e+02
%806 = torch.aten.clamp %805, %float-2.400000e02_278, %float2.400000e02_279 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %806, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_280 = torch.constant.int 26
%807 = torch.prims.convert_element_type %806, %int26_280 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %807, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_281 = torch.constant.int -2
%int-1_282 = torch.constant.int -1
%808 = torch.aten.transpose.int %24, %int-2_281, %int-1_282 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_283 = torch.constant.int 4096
%809 = torch.prim.ListConstruct %566, %int4096_283 : (!torch.int, !torch.int) -> !torch.list<int>
%810 = torch.aten.view %807, %809 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %810, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%811 = torch.aten.mm %810, %808 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %811, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_284 = torch.constant.int 1
%int1024_285 = torch.constant.int 1024
%812 = torch.prim.ListConstruct %int1_284, %566, %int1024_285 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%813 = torch.aten.view %811, %812 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %813, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_286 = torch.constant.int 15
%814 = torch.prims.convert_element_type %813, %int15_286 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %814, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_287 = torch.constant.int 1
%int32_288 = torch.constant.int 32
%int128_289 = torch.constant.int 128
%815 = torch.prim.ListConstruct %int1_287, %566, %int32_288, %int128_289 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%816 = torch.aten.view %794, %815 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %816, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_290 = torch.constant.int 1
%int8_291 = torch.constant.int 8
%int128_292 = torch.constant.int 128
%817 = torch.prim.ListConstruct %int1_290, %566, %int8_291, %int128_292 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%818 = torch.aten.view %804, %817 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %818, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_293 = torch.constant.int 1
%int8_294 = torch.constant.int 8
%int128_295 = torch.constant.int 128
%819 = torch.prim.ListConstruct %int1_293, %566, %int8_294, %int128_295 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%820 = torch.aten.view %814, %819 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %820, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
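// The RoPE angle-table construction (positions x theta, base 500000, dim 128)
// repeats below for this block's query tensor.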
%int131072_296 = torch.constant.int 131072
%none_297 = torch.constant.none
%none_298 = torch.constant.none
%cpu_299 = torch.constant.device "cpu"
%false_300 = torch.constant.bool false
%821 = torch.aten.arange %int131072_296, %none_297, %none_298, %cpu_299, %false_300 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_301 = torch.constant.int 0
%int128_302 = torch.constant.int 128
%none_303 = torch.constant.none
%none_304 = torch.constant.none
%cpu_305 = torch.constant.device "cpu"
%false_306 = torch.constant.bool false
%822 = torch.aten.arange.start %int0_301, %int128_302, %none_303, %none_304, %cpu_305, %false_306 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_307 = torch.constant.int 2
%823 = torch.aten.floor_divide.Scalar %822, %int2_307 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_308 = torch.constant.int 6
%824 = torch.prims.convert_element_type %823, %int6_308 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_309 = torch.constant.int 128
%825 = torch.aten.div.Scalar %824, %int128_309 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_310 = torch.constant.float 2.000000e+00
%826 = torch.aten.mul.Scalar %825, %float2.000000e00_310 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_311 = torch.constant.float 5.000000e+05
%827 = torch.aten.pow.Scalar %float5.000000e05_311, %826 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%828 = torch.aten.reciprocal %827 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_312 = torch.constant.float 1.000000e+00
%829 = torch.aten.mul.Scalar %828, %float1.000000e00_312 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_313 = torch.constant.int 131072
%int1_314 = torch.constant.int 1
%830 = torch.prim.ListConstruct %int131072_313, %int1_314 : (!torch.int, !torch.int) -> !torch.list<int>
%831 = torch.aten.view %821, %830 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%832 = torch.aten.mul.Tensor %831, %829 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_315 = torch.constant.int 1
%833 = torch.aten.size.int %793, %int1_315 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_316 = torch.constant.int 0
%834 = torch.aten.add.int %int0_316, %833 : !torch.int, !torch.int -> !torch.int
%int0_317 = torch.constant.int 0
%int0_318 = torch.constant.int 0
%int1_319 = torch.constant.int 1
%835 = torch.aten.slice.Tensor %832, %int0_317, %int0_318, %834, %int1_319 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %835, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_320 = torch.constant.int 1
%int0_321 = torch.constant.int 0
%int9223372036854775807_322 = torch.constant.int 9223372036854775807
%int1_323 = torch.constant.int 1
%836 = torch.aten.slice.Tensor %835, %int1_320, %int0_321, %int9223372036854775807_322, %int1_323 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %836, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_324 = torch.constant.int 1
%int0_325 = torch.constant.int 0
%int9223372036854775807_326 = torch.constant.int 9223372036854775807
%int1_327 = torch.constant.int 1
%837 = torch.aten.slice.Tensor %836, %int1_324, %int0_325, %int9223372036854775807_326, %int1_327 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %837, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_328 = torch.constant.int 0
%838 = torch.aten.unsqueeze %837, %int0_328 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %838, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_329 = torch.constant.int 1
%int0_330 = torch.constant.int 0
%int9223372036854775807_331 = torch.constant.int 9223372036854775807
%int1_332 = torch.constant.int 1
%839 = torch.aten.slice.Tensor %838, %int1_329, %int0_330, %int9223372036854775807_331, %int1_332 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %839, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_333 = torch.constant.int 2
%int0_334 = torch.constant.int 0
%int9223372036854775807_335 = torch.constant.int 9223372036854775807
%int1_336 = torch.constant.int 1
%840 = torch.aten.slice.Tensor %839, %int2_333, %int0_334, %int9223372036854775807_335, %int1_336 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %840, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_337 = torch.constant.int 1
%int1_338 = torch.constant.int 1
%int1_339 = torch.constant.int 1
%841 = torch.prim.ListConstruct %int1_337, %int1_338, %int1_339 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%842 = torch.aten.repeat %840, %841 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %842, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
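// Rotate the 32 query heads: upcast %816 to f32, apply the sharktank_rotary_embedding_1_D_32_128_f32
// kernel with the sliced angle table (%842), and narrow the result back to bf16.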
%int6_340 = torch.constant.int 6
%843 = torch.prims.convert_element_type %816, %int6_340 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %843, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%844 = torch_c.to_builtin_tensor %843 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%845 = torch_c.to_builtin_tensor %842 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%846 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%844, %845) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%847 = torch_c.from_builtin_tensor %846 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %847, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_341 = torch.constant.int 15
%848 = torch.prims.convert_element_type %847, %int15_341 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %848, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
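// The same angle-table construction and rotary kernel are repeated for the 8 key heads,
// this time through sharktank_rotary_embedding_1_D_8_128_f32.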
%int131072_342 = torch.constant.int 131072
%none_343 = torch.constant.none
%none_344 = torch.constant.none
%cpu_345 = torch.constant.device "cpu"
%false_346 = torch.constant.bool false
%849 = torch.aten.arange %int131072_342, %none_343, %none_344, %cpu_345, %false_346 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_347 = torch.constant.int 0
%int128_348 = torch.constant.int 128
%none_349 = torch.constant.none
%none_350 = torch.constant.none
%cpu_351 = torch.constant.device "cpu"
%false_352 = torch.constant.bool false
%850 = torch.aten.arange.start %int0_347, %int128_348, %none_349, %none_350, %cpu_351, %false_352 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_353 = torch.constant.int 2
%851 = torch.aten.floor_divide.Scalar %850, %int2_353 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_354 = torch.constant.int 6
%852 = torch.prims.convert_element_type %851, %int6_354 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_355 = torch.constant.int 128
%853 = torch.aten.div.Scalar %852, %int128_355 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_356 = torch.constant.float 2.000000e+00
%854 = torch.aten.mul.Scalar %853, %float2.000000e00_356 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_357 = torch.constant.float 5.000000e+05
%855 = torch.aten.pow.Scalar %float5.000000e05_357, %854 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%856 = torch.aten.reciprocal %855 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_358 = torch.constant.float 1.000000e+00
%857 = torch.aten.mul.Scalar %856, %float1.000000e00_358 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_359 = torch.constant.int 131072
%int1_360 = torch.constant.int 1
%858 = torch.prim.ListConstruct %int131072_359, %int1_360 : (!torch.int, !torch.int) -> !torch.list<int>
%859 = torch.aten.view %849, %858 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%860 = torch.aten.mul.Tensor %859, %857 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_361 = torch.constant.int 1
%861 = torch.aten.size.int %803, %int1_361 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_362 = torch.constant.int 0
%862 = torch.aten.add.int %int0_362, %861 : !torch.int, !torch.int -> !torch.int
%int0_363 = torch.constant.int 0
%int0_364 = torch.constant.int 0
%int1_365 = torch.constant.int 1
%863 = torch.aten.slice.Tensor %860, %int0_363, %int0_364, %862, %int1_365 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %863, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_366 = torch.constant.int 1
%int0_367 = torch.constant.int 0
%int9223372036854775807_368 = torch.constant.int 9223372036854775807
%int1_369 = torch.constant.int 1
%864 = torch.aten.slice.Tensor %863, %int1_366, %int0_367, %int9223372036854775807_368, %int1_369 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %864, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_370 = torch.constant.int 1
%int0_371 = torch.constant.int 0
%int9223372036854775807_372 = torch.constant.int 9223372036854775807
%int1_373 = torch.constant.int 1
%865 = torch.aten.slice.Tensor %864, %int1_370, %int0_371, %int9223372036854775807_372, %int1_373 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %865, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_374 = torch.constant.int 0
%866 = torch.aten.unsqueeze %865, %int0_374 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %866, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_375 = torch.constant.int 1
%int0_376 = torch.constant.int 0
%int9223372036854775807_377 = torch.constant.int 9223372036854775807
%int1_378 = torch.constant.int 1
%867 = torch.aten.slice.Tensor %866, %int1_375, %int0_376, %int9223372036854775807_377, %int1_378 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %867, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_379 = torch.constant.int 2
%int0_380 = torch.constant.int 0
%int9223372036854775807_381 = torch.constant.int 9223372036854775807
%int1_382 = torch.constant.int 1
%868 = torch.aten.slice.Tensor %867, %int2_379, %int0_380, %int9223372036854775807_381, %int1_382 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %868, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_383 = torch.constant.int 1
%int1_384 = torch.constant.int 1
%int1_385 = torch.constant.int 1
%869 = torch.prim.ListConstruct %int1_383, %int1_384, %int1_385 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%870 = torch.aten.repeat %868, %869 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %870, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_386 = torch.constant.int 6
%871 = torch.prims.convert_element_type %818, %int6_386 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %871, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%872 = torch_c.to_builtin_tensor %871 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%873 = torch_c.to_builtin_tensor %870 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%874 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%872, %873) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%875 = torch_c.from_builtin_tensor %874 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %875, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_387 = torch.constant.int 15
%876 = torch.prims.convert_element_type %875, %int15_387 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %876, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
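// Quantize the rotated keys (%876) and the values (%820) for the KV cache: divide by the
// per-tensor scale %25, clamp to the f8E4M3FNUZ range [-240, 240], and cast to f8 (dtype code 26).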
%877 = torch.aten.div.Tensor %876, %25 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %877, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_388 = torch.constant.float -2.400000e+02
%float2.400000e02_389 = torch.constant.float 2.400000e+02
%878 = torch.aten.clamp %877, %float-2.400000e02_388, %float2.400000e02_389 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %878, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_390 = torch.constant.int 26
%879 = torch.prims.convert_element_type %878, %int26_390 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %879, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%880 = torch.aten.div.Tensor %820, %25 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %880, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_391 = torch.constant.float -2.400000e+02
%float2.400000e02_392 = torch.constant.float 2.400000e+02
%881 = torch.aten.clamp %880, %float-2.400000e02_391, %float2.400000e02_392 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %881, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_393 = torch.constant.int 26
%882 = torch.prims.convert_element_type %881, %int26_393 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %882, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
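// Scatter K and V into the paged f16 cache (%699): the cache is viewed as
// [pages, 32, 2, 32, 8, 128] and flattened to [pages*64, 32, 8, 128] rows; row indices come from
// the page table %arg2 (scaled by 64 plus a fixed slot offset), with K written first and V at the
// next slot via index_put.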
%int64_394 = torch.constant.int 64
%883 = torch.aten.mul.Scalar %arg2, %int64_394 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %883, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int2_395 = torch.constant.int 2
%int1_396 = torch.constant.int 1
%884 = torch.aten.add.Scalar %883, %int2_395, %int1_396 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %884, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_397 = torch.constant.int 1
%int32_398 = torch.constant.int 32
%int8_399 = torch.constant.int 8
%int128_400 = torch.constant.int 128
%885 = torch.prim.ListConstruct %int1_397, %670, %int32_398, %int8_399, %int128_400 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%886 = torch.aten.view %879, %885 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %886, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_401 = torch.constant.int 32
%int8_402 = torch.constant.int 8
%int128_403 = torch.constant.int 128
%887 = torch.prim.ListConstruct %670, %int32_401, %int8_402, %int128_403 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%888 = torch.aten.view %886, %887 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %888, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%889 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%890 = torch.aten.view %884, %889 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %890, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_404 = torch.constant.int 32
%int2_405 = torch.constant.int 2
%int32_406 = torch.constant.int 32
%int8_407 = torch.constant.int 8
%int128_408 = torch.constant.int 128
%891 = torch.prim.ListConstruct %661, %int32_404, %int2_405, %int32_406, %int8_407, %int128_408 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%892 = torch.aten.view %699, %891 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %892, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_409 = torch.constant.int 32
%893 = torch.aten.mul.int %661, %int32_409 : !torch.int, !torch.int -> !torch.int
%int2_410 = torch.constant.int 2
%894 = torch.aten.mul.int %893, %int2_410 : !torch.int, !torch.int -> !torch.int
%int32_411 = torch.constant.int 32
%int8_412 = torch.constant.int 8
%int128_413 = torch.constant.int 128
%895 = torch.prim.ListConstruct %894, %int32_411, %int8_412, %int128_413 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%896 = torch.aten.view %892, %895 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %896, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%897 = torch.prim.ListConstruct %890 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_414 = torch.constant.bool false
%898 = torch.aten.index_put %896, %897, %888, %false_414 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %898, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_415 = torch.constant.int 32
%int2_416 = torch.constant.int 2
%int32_417 = torch.constant.int 32
%int8_418 = torch.constant.int 8
%int128_419 = torch.constant.int 128
%899 = torch.prim.ListConstruct %661, %int32_415, %int2_416, %int32_417, %int8_418, %int128_419 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%900 = torch.aten.view %898, %899 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %900, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_420 = torch.constant.int 2097152
%901 = torch.prim.ListConstruct %661, %int2097152_420 : (!torch.int, !torch.int) -> !torch.list<int>
%902 = torch.aten.view %900, %901 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %902, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_421 = torch.constant.int 32
%int2_422 = torch.constant.int 2
%int32_423 = torch.constant.int 32
%int8_424 = torch.constant.int 8
%int128_425 = torch.constant.int 128
%903 = torch.prim.ListConstruct %661, %int32_421, %int2_422, %int32_423, %int8_424, %int128_425 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%904 = torch.aten.view %902, %903 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %904, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_426 = torch.constant.int 32
%int8_427 = torch.constant.int 8
%int128_428 = torch.constant.int 128
%905 = torch.prim.ListConstruct %894, %int32_426, %int8_427, %int128_428 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%906 = torch.aten.view %904, %905 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %906, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_429 = torch.constant.int 1
%int32_430 = torch.constant.int 32
%int8_431 = torch.constant.int 8
%int128_432 = torch.constant.int 128
%907 = torch.prim.ListConstruct %int1_429, %670, %int32_430, %int8_431, %int128_432 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%908 = torch.aten.view %882, %907 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %908, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_433 = torch.constant.int 32
%int8_434 = torch.constant.int 8
%int128_435 = torch.constant.int 128
%909 = torch.prim.ListConstruct %670, %int32_433, %int8_434, %int128_435 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%910 = torch.aten.view %908, %909 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %910, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_436 = torch.constant.int 1
%int1_437 = torch.constant.int 1
%911 = torch.aten.add.Scalar %884, %int1_436, %int1_437 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %911, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%912 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%913 = torch.aten.view %911, %912 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %913, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%914 = torch.prim.ListConstruct %913 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_438 = torch.constant.bool false
%915 = torch.aten.index_put %906, %914, %910, %false_438 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %915, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_439 = torch.constant.int 32
%int2_440 = torch.constant.int 2
%int32_441 = torch.constant.int 32
%int8_442 = torch.constant.int 8
%int128_443 = torch.constant.int 128
%916 = torch.prim.ListConstruct %661, %int32_439, %int2_440, %int32_441, %int8_442, %int128_443 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%917 = torch.aten.view %915, %916 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %917, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_444 = torch.constant.int 2097152
%918 = torch.prim.ListConstruct %661, %int2097152_444 : (!torch.int, !torch.int) -> !torch.list<int>
%919 = torch.aten.view %917, %918 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %919, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
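// Grouped-query attention expansion: each of the 8 K/V heads is broadcast to 4 query heads
// (unsqueeze, expand to [1, seq, 8, 4, 128], clone, then view as [1, seq, 32, 128]).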
%int-2_445 = torch.constant.int -2
%920 = torch.aten.unsqueeze %879, %int-2_445 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %920, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_446 = torch.constant.int 1
%int8_447 = torch.constant.int 8
%int4_448 = torch.constant.int 4
%int128_449 = torch.constant.int 128
%921 = torch.prim.ListConstruct %int1_446, %861, %int8_447, %int4_448, %int128_449 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_450 = torch.constant.bool false
%922 = torch.aten.expand %920, %921, %false_450 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %922, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_451 = torch.constant.int 0
%923 = torch.aten.clone %922, %int0_451 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %923, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_452 = torch.constant.int 1
%int32_453 = torch.constant.int 32
%int128_454 = torch.constant.int 128
%924 = torch.prim.ListConstruct %int1_452, %861, %int32_453, %int128_454 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%925 = torch.aten._unsafe_view %923, %924 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %925, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_455 = torch.constant.int -2
%926 = torch.aten.unsqueeze %882, %int-2_455 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %926, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_456 = torch.constant.int 1
%927 = torch.aten.size.int %813, %int1_456 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_457 = torch.constant.int 1
%int8_458 = torch.constant.int 8
%int4_459 = torch.constant.int 4
%int128_460 = torch.constant.int 128
%928 = torch.prim.ListConstruct %int1_457, %927, %int8_458, %int4_459, %int128_460 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_461 = torch.constant.bool false
%929 = torch.aten.expand %926, %928, %false_461 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %929, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_462 = torch.constant.int 0
%930 = torch.aten.clone %929, %int0_462 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %930, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_463 = torch.constant.int 1
%int32_464 = torch.constant.int 32
%int128_465 = torch.constant.int 128
%931 = torch.prim.ListConstruct %int1_463, %927, %int32_464, %int128_465 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%932 = torch.aten._unsafe_view %930, %931 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %932, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
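// Dequantize K and V for attention: upcast the f8 values to f32, multiply by the cache scale %25,
// and narrow to bf16.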
%int6_466 = torch.constant.int 6
%933 = torch.prims.convert_element_type %925, %int6_466 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %933, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%934 = torch.aten.mul.Tensor %933, %25 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %934, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_467 = torch.constant.int 15
%935 = torch.prims.convert_element_type %934, %int15_467 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %935, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_468 = torch.constant.int 6
%936 = torch.prims.convert_element_type %932, %int6_468 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %936, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%937 = torch.aten.mul.Tensor %936, %25 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %937, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_469 = torch.constant.int 15
%938 = torch.prims.convert_element_type %937, %int15_469 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %938, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
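// Attention: transpose Q, K and V to [1, 32, seq, 128] and run
// _scaled_dot_product_flash_attention_for_cpu with dropout 0.0 and is_causal = true.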
%int1_470 = torch.constant.int 1
%int2_471 = torch.constant.int 2
%939 = torch.aten.transpose.int %848, %int1_470, %int2_471 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %939, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_472 = torch.constant.int 1
%int2_473 = torch.constant.int 2
%940 = torch.aten.transpose.int %935, %int1_472, %int2_473 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %940, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_474 = torch.constant.int 1
%int2_475 = torch.constant.int 2
%941 = torch.aten.transpose.int %938, %int1_474, %int2_475 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %941, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_476 = torch.constant.float 0.000000e+00
%true_477 = torch.constant.bool true
%none_478 = torch.constant.none
%none_479 = torch.constant.none
%942:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%939, %940, %941, %float0.000000e00_476, %true_477, %none_478, %none_479) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %942#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
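// Merge the heads back to [1, seq, 4096], requantize the attention output (scale %26, clamp to
// +-240, cast to f8), apply the attn_output matmul against %27 transposed, widen to bf16, and add
// the residual from the block input (%777).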
%int1_480 = torch.constant.int 1
%int2_481 = torch.constant.int 2
%943 = torch.aten.transpose.int %942#0, %int1_480, %int2_481 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %943, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_482 = torch.constant.int 1
%int4096_483 = torch.constant.int 4096
%944 = torch.prim.ListConstruct %int1_482, %833, %int4096_483 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%945 = torch.aten.view %943, %944 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %945, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%946 = torch.aten.div.Tensor %945, %26 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %946, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_484 = torch.constant.float -2.400000e+02
%float2.400000e02_485 = torch.constant.float 2.400000e+02
%947 = torch.aten.clamp %946, %float-2.400000e02_484, %float2.400000e02_485 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %947, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_486 = torch.constant.int 26
%948 = torch.prims.convert_element_type %947, %int26_486 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %948, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_487 = torch.constant.int -2
%int-1_488 = torch.constant.int -1
%949 = torch.aten.transpose.int %27, %int-2_487, %int-1_488 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_489 = torch.constant.int 4096
%950 = torch.prim.ListConstruct %833, %int4096_489 : (!torch.int, !torch.int) -> !torch.list<int>
%951 = torch.aten.view %948, %950 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %951, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%952 = torch.aten.mm %951, %949 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %952, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_490 = torch.constant.int 1
%int4096_491 = torch.constant.int 4096
%953 = torch.prim.ListConstruct %int1_490, %833, %int4096_491 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%954 = torch.aten.view %952, %953 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %954, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_492 = torch.constant.int 15
%955 = torch.prims.convert_element_type %954, %int15_492 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %955, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_493 = torch.constant.int 1
%956 = torch.aten.add.Tensor %777, %955, %int1_493 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %956, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
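// ffn_norm RMSNorm: x / sqrt(mean(x^2) + 1e-5), scaled by the norm weight %28; the normalized
// activations are then rescaled, clamped to +-240 and cast to f8 for the gate matmul.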
%int2_494 = torch.constant.int 2
%957 = torch.aten.pow.Tensor_Scalar %956, %int2_494 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %957, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_495 = torch.constant.int -1
%958 = torch.prim.ListConstruct %int-1_495 : (!torch.int) -> !torch.list<int>
%true_496 = torch.constant.bool true
%none_497 = torch.constant.none
%959 = torch.aten.mean.dim %957, %958, %true_496, %none_497 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %959, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_498 = torch.constant.float 1.000000e-05
%int1_499 = torch.constant.int 1
%960 = torch.aten.add.Scalar %959, %float1.000000e-05_498, %int1_499 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %960, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%961 = torch.aten.rsqrt %960 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %961, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%962 = torch.aten.mul.Tensor %956, %961 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %962, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%963 = torch.aten.mul.Tensor %28, %962 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %963, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%964 = torch.aten.div.Tensor %963, %29 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %964, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_500 = torch.constant.float -2.400000e+02
%float2.400000e02_501 = torch.constant.float 2.400000e+02
%965 = torch.aten.clamp %964, %float-2.400000e02_500, %float2.400000e02_501 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %965, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_502 = torch.constant.int 26
%966 = torch.prims.convert_element_type %965, %int26_502 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %966, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
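// SwiGLU MLP in f8: gate = silu(x_norm @ W_gate^T) and up = x_norm @ W_up^T (both [seq, 14336],
// weights %30 and %32 with input scales %29 and %31); their product is requantized and multiplied
// by W_down^T (%34) back to [seq, 4096], widened to bf16, and added to the residual (%956).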
%int-2_503 = torch.constant.int -2
%int-1_504 = torch.constant.int -1
%967 = torch.aten.transpose.int %30, %int-2_503, %int-1_504 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_505 = torch.constant.int 4096
%968 = torch.prim.ListConstruct %566, %int4096_505 : (!torch.int, !torch.int) -> !torch.list<int>
%969 = torch.aten.view %966, %968 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %969, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%970 = torch.aten.mm %969, %967 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %970, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_506 = torch.constant.int 1
%int14336_507 = torch.constant.int 14336
%971 = torch.prim.ListConstruct %int1_506, %566, %int14336_507 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%972 = torch.aten.view %970, %971 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %972, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_508 = torch.constant.int 15
%973 = torch.prims.convert_element_type %972, %int15_508 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %973, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%974 = torch.aten.silu %973 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %974, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%975 = torch.aten.div.Tensor %963, %31 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %975, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_509 = torch.constant.float -2.400000e+02
%float2.400000e02_510 = torch.constant.float 2.400000e+02
%976 = torch.aten.clamp %975, %float-2.400000e02_509, %float2.400000e02_510 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %976, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_511 = torch.constant.int 26
%977 = torch.prims.convert_element_type %976, %int26_511 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %977, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_512 = torch.constant.int -2
%int-1_513 = torch.constant.int -1
%978 = torch.aten.transpose.int %32, %int-2_512, %int-1_513 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_514 = torch.constant.int 4096
%979 = torch.prim.ListConstruct %566, %int4096_514 : (!torch.int, !torch.int) -> !torch.list<int>
%980 = torch.aten.view %977, %979 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %980, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%981 = torch.aten.mm %980, %978 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %981, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_515 = torch.constant.int 1
%int14336_516 = torch.constant.int 14336
%982 = torch.prim.ListConstruct %int1_515, %566, %int14336_516 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%983 = torch.aten.view %981, %982 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %983, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_517 = torch.constant.int 15
%984 = torch.prims.convert_element_type %983, %int15_517 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %984, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%985 = torch.aten.mul.Tensor %974, %984 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %985, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%986 = torch.aten.div.Tensor %985, %33 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %986, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_518 = torch.constant.float -2.400000e+02
%float2.400000e02_519 = torch.constant.float 2.400000e+02
%987 = torch.aten.clamp %986, %float-2.400000e02_518, %float2.400000e02_519 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %987, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_520 = torch.constant.int 26
%988 = torch.prims.convert_element_type %987, %int26_520 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %988, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_521 = torch.constant.int -2
%int-1_522 = torch.constant.int -1
%989 = torch.aten.transpose.int %34, %int-2_521, %int-1_522 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_523 = torch.constant.int 1
%990 = torch.aten.size.int %972, %int1_523 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_524 = torch.constant.int 14336
%991 = torch.prim.ListConstruct %990, %int14336_524 : (!torch.int, !torch.int) -> !torch.list<int>
%992 = torch.aten.view %988, %991 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %992, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%993 = torch.aten.mm %992, %989 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %993, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_525 = torch.constant.int 1
%int4096_526 = torch.constant.int 4096
%994 = torch.prim.ListConstruct %int1_525, %990, %int4096_526 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%995 = torch.aten.view %993, %994 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %995, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_527 = torch.constant.int 15
%996 = torch.prims.convert_element_type %995, %int15_527 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %996, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_528 = torch.constant.int 1
%997 = torch.aten.add.Tensor %956, %996, %int1_528 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %997, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
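// Next transformer block: attn_norm RMSNorm (weight %35), then the Q, K and V projections in f8
// (weights %37, %39, %41 with input scales %36, %38, %40); the bf16 outputs are reshaped into
// 32 query heads and 8 key/value heads of dimension 128.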
%int2_529 = torch.constant.int 2
%998 = torch.aten.pow.Tensor_Scalar %997, %int2_529 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %998, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_530 = torch.constant.int -1
%999 = torch.prim.ListConstruct %int-1_530 : (!torch.int) -> !torch.list<int>
%true_531 = torch.constant.bool true
%none_532 = torch.constant.none
%1000 = torch.aten.mean.dim %998, %999, %true_531, %none_532 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1000, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_533 = torch.constant.float 1.000000e-05
%int1_534 = torch.constant.int 1
%1001 = torch.aten.add.Scalar %1000, %float1.000000e-05_533, %int1_534 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1001, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1002 = torch.aten.rsqrt %1001 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1002, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1003 = torch.aten.mul.Tensor %997, %1002 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1003, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1004 = torch.aten.mul.Tensor %35, %1003 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1004, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1005 = torch.aten.div.Tensor %1004, %36 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1005, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_535 = torch.constant.float -2.400000e+02
%float2.400000e02_536 = torch.constant.float 2.400000e+02
%1006 = torch.aten.clamp %1005, %float-2.400000e02_535, %float2.400000e02_536 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1006, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_537 = torch.constant.int 26
%1007 = torch.prims.convert_element_type %1006, %int26_537 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1007, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_538 = torch.constant.int -2
%int-1_539 = torch.constant.int -1
%1008 = torch.aten.transpose.int %37, %int-2_538, %int-1_539 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_540 = torch.constant.int 4096
%1009 = torch.prim.ListConstruct %566, %int4096_540 : (!torch.int, !torch.int) -> !torch.list<int>
%1010 = torch.aten.view %1007, %1009 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1010, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1011 = torch.aten.mm %1010, %1008 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1011, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_541 = torch.constant.int 1
%int4096_542 = torch.constant.int 4096
%1012 = torch.prim.ListConstruct %int1_541, %566, %int4096_542 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1013 = torch.aten.view %1011, %1012 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1013, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_543 = torch.constant.int 15
%1014 = torch.prims.convert_element_type %1013, %int15_543 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1014, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1015 = torch.aten.div.Tensor %1004, %38 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1015, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_544 = torch.constant.float -2.400000e+02
%float2.400000e02_545 = torch.constant.float 2.400000e+02
%1016 = torch.aten.clamp %1015, %float-2.400000e02_544, %float2.400000e02_545 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1016, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_546 = torch.constant.int 26
%1017 = torch.prims.convert_element_type %1016, %int26_546 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1017, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_547 = torch.constant.int -2
%int-1_548 = torch.constant.int -1
%1018 = torch.aten.transpose.int %39, %int-2_547, %int-1_548 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_549 = torch.constant.int 4096
%1019 = torch.prim.ListConstruct %566, %int4096_549 : (!torch.int, !torch.int) -> !torch.list<int>
%1020 = torch.aten.view %1017, %1019 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1020, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1021 = torch.aten.mm %1020, %1018 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1021, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_550 = torch.constant.int 1
%int1024_551 = torch.constant.int 1024
%1022 = torch.prim.ListConstruct %int1_550, %566, %int1024_551 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1023 = torch.aten.view %1021, %1022 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1023, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_552 = torch.constant.int 15
%1024 = torch.prims.convert_element_type %1023, %int15_552 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1024, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1025 = torch.aten.div.Tensor %1004, %40 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1025, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_553 = torch.constant.float -2.400000e+02
%float2.400000e02_554 = torch.constant.float 2.400000e+02
%1026 = torch.aten.clamp %1025, %float-2.400000e02_553, %float2.400000e02_554 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1026, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_555 = torch.constant.int 26
%1027 = torch.prims.convert_element_type %1026, %int26_555 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1027, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_556 = torch.constant.int -2
%int-1_557 = torch.constant.int -1
%1028 = torch.aten.transpose.int %41, %int-2_556, %int-1_557 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_558 = torch.constant.int 4096
%1029 = torch.prim.ListConstruct %566, %int4096_558 : (!torch.int, !torch.int) -> !torch.list<int>
%1030 = torch.aten.view %1027, %1029 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1030, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1031 = torch.aten.mm %1030, %1028 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1031, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_559 = torch.constant.int 1
%int1024_560 = torch.constant.int 1024
%1032 = torch.prim.ListConstruct %int1_559, %566, %int1024_560 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1033 = torch.aten.view %1031, %1032 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1033, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_561 = torch.constant.int 15
%1034 = torch.prims.convert_element_type %1033, %int15_561 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1034, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_562 = torch.constant.int 1
%int32_563 = torch.constant.int 32
%int128_564 = torch.constant.int 128
%1035 = torch.prim.ListConstruct %int1_562, %566, %int32_563, %int128_564 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1036 = torch.aten.view %1014, %1035 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1036, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_565 = torch.constant.int 1
%int8_566 = torch.constant.int 8
%int128_567 = torch.constant.int 128
%1037 = torch.prim.ListConstruct %int1_565, %566, %int8_566, %int128_567 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1038 = torch.aten.view %1024, %1037 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1038, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_568 = torch.constant.int 1
%int8_569 = torch.constant.int 8
%int128_570 = torch.constant.int 128
%1039 = torch.prim.ListConstruct %int1_568, %566, %int8_569, %int128_570 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1040 = torch.aten.view %1034, %1039 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1040, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
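// Rotary embedding for this block's queries: the inv_freq / angle table is rebuilt, sliced to the
// sequence length, and applied through sharktank_rotary_embedding_1_D_32_128_f32.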
%int131072_571 = torch.constant.int 131072
%none_572 = torch.constant.none
%none_573 = torch.constant.none
%cpu_574 = torch.constant.device "cpu"
%false_575 = torch.constant.bool false
%1041 = torch.aten.arange %int131072_571, %none_572, %none_573, %cpu_574, %false_575 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_576 = torch.constant.int 0
%int128_577 = torch.constant.int 128
%none_578 = torch.constant.none
%none_579 = torch.constant.none
%cpu_580 = torch.constant.device "cpu"
%false_581 = torch.constant.bool false
%1042 = torch.aten.arange.start %int0_576, %int128_577, %none_578, %none_579, %cpu_580, %false_581 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_582 = torch.constant.int 2
%1043 = torch.aten.floor_divide.Scalar %1042, %int2_582 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_583 = torch.constant.int 6
%1044 = torch.prims.convert_element_type %1043, %int6_583 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_584 = torch.constant.int 128
%1045 = torch.aten.div.Scalar %1044, %int128_584 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_585 = torch.constant.float 2.000000e+00
%1046 = torch.aten.mul.Scalar %1045, %float2.000000e00_585 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_586 = torch.constant.float 5.000000e+05
%1047 = torch.aten.pow.Scalar %float5.000000e05_586, %1046 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1048 = torch.aten.reciprocal %1047 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_587 = torch.constant.float 1.000000e+00
%1049 = torch.aten.mul.Scalar %1048, %float1.000000e00_587 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_588 = torch.constant.int 131072
%int1_589 = torch.constant.int 1
%1050 = torch.prim.ListConstruct %int131072_588, %int1_589 : (!torch.int, !torch.int) -> !torch.list<int>
%1051 = torch.aten.view %1041, %1050 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1052 = torch.aten.mul.Tensor %1051, %1049 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_590 = torch.constant.int 1
%1053 = torch.aten.size.int %1013, %int1_590 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_591 = torch.constant.int 0
%1054 = torch.aten.add.int %int0_591, %1053 : !torch.int, !torch.int -> !torch.int
%int0_592 = torch.constant.int 0
%int0_593 = torch.constant.int 0
%int1_594 = torch.constant.int 1
%1055 = torch.aten.slice.Tensor %1052, %int0_592, %int0_593, %1054, %int1_594 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1055, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_595 = torch.constant.int 1
%int0_596 = torch.constant.int 0
%int9223372036854775807_597 = torch.constant.int 9223372036854775807
%int1_598 = torch.constant.int 1
%1056 = torch.aten.slice.Tensor %1055, %int1_595, %int0_596, %int9223372036854775807_597, %int1_598 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1056, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_599 = torch.constant.int 1
%int0_600 = torch.constant.int 0
%int9223372036854775807_601 = torch.constant.int 9223372036854775807
%int1_602 = torch.constant.int 1
%1057 = torch.aten.slice.Tensor %1056, %int1_599, %int0_600, %int9223372036854775807_601, %int1_602 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1057, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_603 = torch.constant.int 0
%1058 = torch.aten.unsqueeze %1057, %int0_603 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1058, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_604 = torch.constant.int 1
%int0_605 = torch.constant.int 0
%int9223372036854775807_606 = torch.constant.int 9223372036854775807
%int1_607 = torch.constant.int 1
%1059 = torch.aten.slice.Tensor %1058, %int1_604, %int0_605, %int9223372036854775807_606, %int1_607 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1059, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_608 = torch.constant.int 2
%int0_609 = torch.constant.int 0
%int9223372036854775807_610 = torch.constant.int 9223372036854775807
%int1_611 = torch.constant.int 1
%1060 = torch.aten.slice.Tensor %1059, %int2_608, %int0_609, %int9223372036854775807_610, %int1_611 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1060, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_612 = torch.constant.int 1
%int1_613 = torch.constant.int 1
%int1_614 = torch.constant.int 1
%1061 = torch.prim.ListConstruct %int1_612, %int1_613, %int1_614 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1062 = torch.aten.repeat %1060, %1061 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1062, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_615 = torch.constant.int 6
%1063 = torch.prims.convert_element_type %1036, %int6_615 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1063, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1064 = torch_c.to_builtin_tensor %1063 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1065 = torch_c.to_builtin_tensor %1062 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1066 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1064, %1065) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1067 = torch_c.from_builtin_tensor %1066 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1067, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_616 = torch.constant.int 15
%1068 = torch.prims.convert_element_type %1067, %int15_616 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1068, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
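    // The same angle table is recomputed below and applied to the 8 KV heads of the key via sharktank_rotary_embedding_1_D_8_128_f32, mirroring the query path above.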
%int131072_617 = torch.constant.int 131072
%none_618 = torch.constant.none
%none_619 = torch.constant.none
%cpu_620 = torch.constant.device "cpu"
%false_621 = torch.constant.bool false
%1069 = torch.aten.arange %int131072_617, %none_618, %none_619, %cpu_620, %false_621 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_622 = torch.constant.int 0
%int128_623 = torch.constant.int 128
%none_624 = torch.constant.none
%none_625 = torch.constant.none
%cpu_626 = torch.constant.device "cpu"
%false_627 = torch.constant.bool false
%1070 = torch.aten.arange.start %int0_622, %int128_623, %none_624, %none_625, %cpu_626, %false_627 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_628 = torch.constant.int 2
%1071 = torch.aten.floor_divide.Scalar %1070, %int2_628 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_629 = torch.constant.int 6
%1072 = torch.prims.convert_element_type %1071, %int6_629 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_630 = torch.constant.int 128
%1073 = torch.aten.div.Scalar %1072, %int128_630 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_631 = torch.constant.float 2.000000e+00
%1074 = torch.aten.mul.Scalar %1073, %float2.000000e00_631 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_632 = torch.constant.float 5.000000e+05
%1075 = torch.aten.pow.Scalar %float5.000000e05_632, %1074 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1076 = torch.aten.reciprocal %1075 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_633 = torch.constant.float 1.000000e+00
%1077 = torch.aten.mul.Scalar %1076, %float1.000000e00_633 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_634 = torch.constant.int 131072
%int1_635 = torch.constant.int 1
%1078 = torch.prim.ListConstruct %int131072_634, %int1_635 : (!torch.int, !torch.int) -> !torch.list<int>
%1079 = torch.aten.view %1069, %1078 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1080 = torch.aten.mul.Tensor %1079, %1077 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_636 = torch.constant.int 1
%1081 = torch.aten.size.int %1023, %int1_636 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_637 = torch.constant.int 0
%1082 = torch.aten.add.int %int0_637, %1081 : !torch.int, !torch.int -> !torch.int
%int0_638 = torch.constant.int 0
%int0_639 = torch.constant.int 0
%int1_640 = torch.constant.int 1
%1083 = torch.aten.slice.Tensor %1080, %int0_638, %int0_639, %1082, %int1_640 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1083, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_641 = torch.constant.int 1
%int0_642 = torch.constant.int 0
%int9223372036854775807_643 = torch.constant.int 9223372036854775807
%int1_644 = torch.constant.int 1
%1084 = torch.aten.slice.Tensor %1083, %int1_641, %int0_642, %int9223372036854775807_643, %int1_644 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1084, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_645 = torch.constant.int 1
%int0_646 = torch.constant.int 0
%int9223372036854775807_647 = torch.constant.int 9223372036854775807
%int1_648 = torch.constant.int 1
%1085 = torch.aten.slice.Tensor %1084, %int1_645, %int0_646, %int9223372036854775807_647, %int1_648 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1085, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_649 = torch.constant.int 0
%1086 = torch.aten.unsqueeze %1085, %int0_649 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1086, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_650 = torch.constant.int 1
%int0_651 = torch.constant.int 0
%int9223372036854775807_652 = torch.constant.int 9223372036854775807
%int1_653 = torch.constant.int 1
%1087 = torch.aten.slice.Tensor %1086, %int1_650, %int0_651, %int9223372036854775807_652, %int1_653 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1087, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_654 = torch.constant.int 2
%int0_655 = torch.constant.int 0
%int9223372036854775807_656 = torch.constant.int 9223372036854775807
%int1_657 = torch.constant.int 1
%1088 = torch.aten.slice.Tensor %1087, %int2_654, %int0_655, %int9223372036854775807_656, %int1_657 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1088, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_658 = torch.constant.int 1
%int1_659 = torch.constant.int 1
%int1_660 = torch.constant.int 1
%1089 = torch.prim.ListConstruct %int1_658, %int1_659, %int1_660 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1090 = torch.aten.repeat %1088, %1089 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1090, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_661 = torch.constant.int 6
%1091 = torch.prims.convert_element_type %1038, %int6_661 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1091, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1092 = torch_c.to_builtin_tensor %1091 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1093 = torch_c.to_builtin_tensor %1090 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1094 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1092, %1093) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1095 = torch_c.from_builtin_tensor %1094 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1095, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_662 = torch.constant.int 15
%1096 = torch.prims.convert_element_type %1095, %int15_662 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1096, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
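    // The rotated key (and the value produced earlier) appear to be quantized for the KV cache below: divide by what looks like the kv_cache quantizer scale (%42), clamp to +/-240 (the finite range of f8E4M3FNUZ), then cast to f8E4M3FNUZ.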
%1097 = torch.aten.div.Tensor %1096, %42 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1097, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_663 = torch.constant.float -2.400000e+02
%float2.400000e02_664 = torch.constant.float 2.400000e+02
%1098 = torch.aten.clamp %1097, %float-2.400000e02_663, %float2.400000e02_664 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1098, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_665 = torch.constant.int 26
%1099 = torch.prims.convert_element_type %1098, %int26_665 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1099, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1100 = torch.aten.div.Tensor %1040, %42 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1100, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_666 = torch.constant.float -2.400000e+02
%float2.400000e02_667 = torch.constant.float 2.400000e+02
%1101 = torch.aten.clamp %1100, %float-2.400000e02_666, %float2.400000e02_667 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1101, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_668 = torch.constant.int 26
%1102 = torch.prims.convert_element_type %1101, %int26_668 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1102, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
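    // The section below appears to write K and V into the paged KV cache: %arg2 holds page ids, scaled by 64 (presumably 32 transformer blocks x {K,V}) plus this block's slot offset; the f16 cache is viewed as [pages, 32, 2, 32, 8, 128], flattened, updated with index_put, and viewed back to [?, 2097152].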
%int64_669 = torch.constant.int 64
%1103 = torch.aten.mul.Scalar %arg2, %int64_669 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1103, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int4_670 = torch.constant.int 4
%int1_671 = torch.constant.int 1
%1104 = torch.aten.add.Scalar %1103, %int4_670, %int1_671 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1104, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_672 = torch.constant.int 1
%int32_673 = torch.constant.int 32
%int8_674 = torch.constant.int 8
%int128_675 = torch.constant.int 128
%1105 = torch.prim.ListConstruct %int1_672, %670, %int32_673, %int8_674, %int128_675 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1106 = torch.aten.view %1099, %1105 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1106, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_676 = torch.constant.int 32
%int8_677 = torch.constant.int 8
%int128_678 = torch.constant.int 128
%1107 = torch.prim.ListConstruct %670, %int32_676, %int8_677, %int128_678 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1108 = torch.aten.view %1106, %1107 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1108, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1109 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1110 = torch.aten.view %1104, %1109 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1110, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_679 = torch.constant.int 32
%int2_680 = torch.constant.int 2
%int32_681 = torch.constant.int 32
%int8_682 = torch.constant.int 8
%int128_683 = torch.constant.int 128
%1111 = torch.prim.ListConstruct %661, %int32_679, %int2_680, %int32_681, %int8_682, %int128_683 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1112 = torch.aten.view %919, %1111 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1112, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_684 = torch.constant.int 32
%1113 = torch.aten.mul.int %661, %int32_684 : !torch.int, !torch.int -> !torch.int
%int2_685 = torch.constant.int 2
%1114 = torch.aten.mul.int %1113, %int2_685 : !torch.int, !torch.int -> !torch.int
%int32_686 = torch.constant.int 32
%int8_687 = torch.constant.int 8
%int128_688 = torch.constant.int 128
%1115 = torch.prim.ListConstruct %1114, %int32_686, %int8_687, %int128_688 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1116 = torch.aten.view %1112, %1115 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1116, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1117 = torch.prim.ListConstruct %1110 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_689 = torch.constant.bool false
%1118 = torch.aten.index_put %1116, %1117, %1108, %false_689 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1118, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_690 = torch.constant.int 32
%int2_691 = torch.constant.int 2
%int32_692 = torch.constant.int 32
%int8_693 = torch.constant.int 8
%int128_694 = torch.constant.int 128
%1119 = torch.prim.ListConstruct %661, %int32_690, %int2_691, %int32_692, %int8_693, %int128_694 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1120 = torch.aten.view %1118, %1119 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1120, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_695 = torch.constant.int 2097152
%1121 = torch.prim.ListConstruct %661, %int2097152_695 : (!torch.int, !torch.int) -> !torch.list<int>
%1122 = torch.aten.view %1120, %1121 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1122, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_696 = torch.constant.int 32
%int2_697 = torch.constant.int 2
%int32_698 = torch.constant.int 32
%int8_699 = torch.constant.int 8
%int128_700 = torch.constant.int 128
%1123 = torch.prim.ListConstruct %661, %int32_696, %int2_697, %int32_698, %int8_699, %int128_700 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1124 = torch.aten.view %1122, %1123 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1124, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_701 = torch.constant.int 32
%int8_702 = torch.constant.int 8
%int128_703 = torch.constant.int 128
%1125 = torch.prim.ListConstruct %1114, %int32_701, %int8_702, %int128_703 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1126 = torch.aten.view %1124, %1125 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1126, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_704 = torch.constant.int 1
%int32_705 = torch.constant.int 32
%int8_706 = torch.constant.int 8
%int128_707 = torch.constant.int 128
%1127 = torch.prim.ListConstruct %int1_704, %670, %int32_705, %int8_706, %int128_707 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1128 = torch.aten.view %1102, %1127 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1128, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_708 = torch.constant.int 32
%int8_709 = torch.constant.int 8
%int128_710 = torch.constant.int 128
%1129 = torch.prim.ListConstruct %670, %int32_708, %int8_709, %int128_710 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1130 = torch.aten.view %1128, %1129 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1130, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_711 = torch.constant.int 1
%int1_712 = torch.constant.int 1
%1131 = torch.aten.add.Scalar %1104, %int1_711, %int1_712 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1131, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1132 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1133 = torch.aten.view %1131, %1132 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1133, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1134 = torch.prim.ListConstruct %1133 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_713 = torch.constant.bool false
%1135 = torch.aten.index_put %1126, %1134, %1130, %false_713 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1135, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_714 = torch.constant.int 32
%int2_715 = torch.constant.int 2
%int32_716 = torch.constant.int 32
%int8_717 = torch.constant.int 8
%int128_718 = torch.constant.int 128
%1136 = torch.prim.ListConstruct %661, %int32_714, %int2_715, %int32_716, %int8_717, %int128_718 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1137 = torch.aten.view %1135, %1136 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1137, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_719 = torch.constant.int 2097152
%1138 = torch.prim.ListConstruct %661, %int2097152_719 : (!torch.int, !torch.int) -> !torch.list<int>
%1139 = torch.aten.view %1137, %1138 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1139, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
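    // Grouped-query attention expansion below: the 8 KV heads are unsqueezed, expanded by a factor of 4, and reshaped to 32 heads to match the query, then dequantized (multiplied by the cache scale) and cast back to bf16 for attention.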
%int-2_720 = torch.constant.int -2
%1140 = torch.aten.unsqueeze %1099, %int-2_720 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1140, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_721 = torch.constant.int 1
%int8_722 = torch.constant.int 8
%int4_723 = torch.constant.int 4
%int128_724 = torch.constant.int 128
%1141 = torch.prim.ListConstruct %int1_721, %1081, %int8_722, %int4_723, %int128_724 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_725 = torch.constant.bool false
%1142 = torch.aten.expand %1140, %1141, %false_725 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1142, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_726 = torch.constant.int 0
%1143 = torch.aten.clone %1142, %int0_726 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1143, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_727 = torch.constant.int 1
%int32_728 = torch.constant.int 32
%int128_729 = torch.constant.int 128
%1144 = torch.prim.ListConstruct %int1_727, %1081, %int32_728, %int128_729 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1145 = torch.aten._unsafe_view %1143, %1144 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1145, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_730 = torch.constant.int -2
%1146 = torch.aten.unsqueeze %1102, %int-2_730 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1146, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_731 = torch.constant.int 1
%1147 = torch.aten.size.int %1033, %int1_731 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_732 = torch.constant.int 1
%int8_733 = torch.constant.int 8
%int4_734 = torch.constant.int 4
%int128_735 = torch.constant.int 128
%1148 = torch.prim.ListConstruct %int1_732, %1147, %int8_733, %int4_734, %int128_735 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_736 = torch.constant.bool false
%1149 = torch.aten.expand %1146, %1148, %false_736 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1149, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_737 = torch.constant.int 0
%1150 = torch.aten.clone %1149, %int0_737 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1150, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_738 = torch.constant.int 1
%int32_739 = torch.constant.int 32
%int128_740 = torch.constant.int 128
%1151 = torch.prim.ListConstruct %int1_738, %1147, %int32_739, %int128_740 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1152 = torch.aten._unsafe_view %1150, %1151 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1152, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int6_741 = torch.constant.int 6
%1153 = torch.prims.convert_element_type %1145, %int6_741 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1153, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1154 = torch.aten.mul.Tensor %1153, %42 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1154, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_742 = torch.constant.int 15
%1155 = torch.prims.convert_element_type %1154, %int15_742 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1155, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_743 = torch.constant.int 6
%1156 = torch.prims.convert_element_type %1152, %int6_743 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1156, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1157 = torch.aten.mul.Tensor %1156, %42 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1157, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_744 = torch.constant.int 15
%1158 = torch.prims.convert_element_type %1157, %int15_744 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1158, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
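    // Q, K, V are transposed to [1, 32, seq, 128] and passed to _scaled_dot_product_flash_attention_for_cpu with dropout 0.0 and is_causal = true; the attention output is transposed back afterwards.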
%int1_745 = torch.constant.int 1
%int2_746 = torch.constant.int 2
%1159 = torch.aten.transpose.int %1068, %int1_745, %int2_746 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1159, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_747 = torch.constant.int 1
%int2_748 = torch.constant.int 2
%1160 = torch.aten.transpose.int %1155, %int1_747, %int2_748 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1160, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_749 = torch.constant.int 1
%int2_750 = torch.constant.int 2
%1161 = torch.aten.transpose.int %1158, %int1_749, %int2_750 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1161, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_751 = torch.constant.float 0.000000e+00
%true_752 = torch.constant.bool true
%none_753 = torch.constant.none
%none_754 = torch.constant.none
%1162:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1159, %1160, %1161, %float0.000000e00_751, %true_752, %none_753, %none_754) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1162#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_755 = torch.constant.int 1
%int2_756 = torch.constant.int 2
%1163 = torch.aten.transpose.int %1162#0, %int1_755, %int2_756 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1163, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
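    // Attention output projection below: flatten to [1, ?, 4096], quantize (scale, clamp to +/-240, cast to f8E4M3FNUZ), matmul with the transposed attn_output weight, then cast back to bf16 and add to the residual stream.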
%int1_757 = torch.constant.int 1
%int4096_758 = torch.constant.int 4096
%1164 = torch.prim.ListConstruct %int1_757, %1053, %int4096_758 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1165 = torch.aten.view %1163, %1164 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1165, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1166 = torch.aten.div.Tensor %1165, %43 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1166, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_759 = torch.constant.float -2.400000e+02
%float2.400000e02_760 = torch.constant.float 2.400000e+02
%1167 = torch.aten.clamp %1166, %float-2.400000e02_759, %float2.400000e02_760 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1167, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_761 = torch.constant.int 26
%1168 = torch.prims.convert_element_type %1167, %int26_761 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1168, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_762 = torch.constant.int -2
%int-1_763 = torch.constant.int -1
%1169 = torch.aten.transpose.int %44, %int-2_762, %int-1_763 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_764 = torch.constant.int 4096
%1170 = torch.prim.ListConstruct %1053, %int4096_764 : (!torch.int, !torch.int) -> !torch.list<int>
%1171 = torch.aten.view %1168, %1170 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1171, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1172 = torch.aten.mm %1171, %1169 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1172, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_765 = torch.constant.int 1
%int4096_766 = torch.constant.int 4096
%1173 = torch.prim.ListConstruct %int1_765, %1053, %int4096_766 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1174 = torch.aten.view %1172, %1173 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1174, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_767 = torch.constant.int 15
%1175 = torch.prims.convert_element_type %1174, %int15_767 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1175, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_768 = torch.constant.int 1
%1176 = torch.aten.add.Tensor %997, %1175, %int1_768 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1176, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
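    // RMSNorm (likely this block's ffn_norm) below: mean of squares over the hidden dimension, add eps 1e-5, rsqrt, scale by the norm weight, followed by quantization for the FFN matmuls.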
%int2_769 = torch.constant.int 2
%1177 = torch.aten.pow.Tensor_Scalar %1176, %int2_769 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1177, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_770 = torch.constant.int -1
%1178 = torch.prim.ListConstruct %int-1_770 : (!torch.int) -> !torch.list<int>
%true_771 = torch.constant.bool true
%none_772 = torch.constant.none
%1179 = torch.aten.mean.dim %1177, %1178, %true_771, %none_772 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1179, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_773 = torch.constant.float 1.000000e-05
%int1_774 = torch.constant.int 1
%1180 = torch.aten.add.Scalar %1179, %float1.000000e-05_773, %int1_774 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1180, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1181 = torch.aten.rsqrt %1180 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1181, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1182 = torch.aten.mul.Tensor %1176, %1181 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1182, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1183 = torch.aten.mul.Tensor %45, %1182 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1183, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1184 = torch.aten.div.Tensor %1183, %46 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1184, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_775 = torch.constant.float -2.400000e+02
%float2.400000e02_776 = torch.constant.float 2.400000e+02
%1185 = torch.aten.clamp %1184, %float-2.400000e02_775, %float2.400000e02_776 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1185, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_777 = torch.constant.int 26
%1186 = torch.prims.convert_element_type %1185, %int26_777 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1186, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
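    // SwiGLU FFN below: gate = silu(x @ W_gate^T) and up = x @ W_up^T are computed in f8E4M3FNUZ and multiplied in bf16; the product is re-quantized, projected down with W_down, and added back to the residual.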
%int-2_778 = torch.constant.int -2
%int-1_779 = torch.constant.int -1
%1187 = torch.aten.transpose.int %47, %int-2_778, %int-1_779 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_780 = torch.constant.int 4096
%1188 = torch.prim.ListConstruct %566, %int4096_780 : (!torch.int, !torch.int) -> !torch.list<int>
%1189 = torch.aten.view %1186, %1188 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1189, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1190 = torch.aten.mm %1189, %1187 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1190, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_781 = torch.constant.int 1
%int14336_782 = torch.constant.int 14336
%1191 = torch.prim.ListConstruct %int1_781, %566, %int14336_782 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1192 = torch.aten.view %1190, %1191 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1192, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_783 = torch.constant.int 15
%1193 = torch.prims.convert_element_type %1192, %int15_783 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1193, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1194 = torch.aten.silu %1193 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1194, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1195 = torch.aten.div.Tensor %1183, %48 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1195, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_784 = torch.constant.float -2.400000e+02
%float2.400000e02_785 = torch.constant.float 2.400000e+02
%1196 = torch.aten.clamp %1195, %float-2.400000e02_784, %float2.400000e02_785 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1196, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_786 = torch.constant.int 26
%1197 = torch.prims.convert_element_type %1196, %int26_786 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1197, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_787 = torch.constant.int -2
%int-1_788 = torch.constant.int -1
%1198 = torch.aten.transpose.int %49, %int-2_787, %int-1_788 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_789 = torch.constant.int 4096
%1199 = torch.prim.ListConstruct %566, %int4096_789 : (!torch.int, !torch.int) -> !torch.list<int>
%1200 = torch.aten.view %1197, %1199 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1200, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1201 = torch.aten.mm %1200, %1198 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1201, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_790 = torch.constant.int 1
%int14336_791 = torch.constant.int 14336
%1202 = torch.prim.ListConstruct %int1_790, %566, %int14336_791 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1203 = torch.aten.view %1201, %1202 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1203, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_792 = torch.constant.int 15
%1204 = torch.prims.convert_element_type %1203, %int15_792 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1204, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1205 = torch.aten.mul.Tensor %1194, %1204 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1205, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1206 = torch.aten.div.Tensor %1205, %50 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1206, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_793 = torch.constant.float -2.400000e+02
%float2.400000e02_794 = torch.constant.float 2.400000e+02
%1207 = torch.aten.clamp %1206, %float-2.400000e02_793, %float2.400000e02_794 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1207, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_795 = torch.constant.int 26
%1208 = torch.prims.convert_element_type %1207, %int26_795 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1208, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_796 = torch.constant.int -2
%int-1_797 = torch.constant.int -1
%1209 = torch.aten.transpose.int %51, %int-2_796, %int-1_797 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_798 = torch.constant.int 1
%1210 = torch.aten.size.int %1192, %int1_798 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_799 = torch.constant.int 14336
%1211 = torch.prim.ListConstruct %1210, %int14336_799 : (!torch.int, !torch.int) -> !torch.list<int>
%1212 = torch.aten.view %1208, %1211 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1212, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1213 = torch.aten.mm %1212, %1209 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1213, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_800 = torch.constant.int 1
%int4096_801 = torch.constant.int 4096
%1214 = torch.prim.ListConstruct %int1_800, %1210, %int4096_801 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1215 = torch.aten.view %1213, %1214 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1215, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_802 = torch.constant.int 15
%1216 = torch.prims.convert_element_type %1215, %int15_802 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1216, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_803 = torch.constant.int 1
%1217 = torch.aten.add.Tensor %1176, %1216, %int1_803 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1217, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
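    // End of this transformer block; the next block appears to repeat the same pattern below, starting with its attn_norm RMSNorm and the Q/K/V projections.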
%int2_804 = torch.constant.int 2
%1218 = torch.aten.pow.Tensor_Scalar %1217, %int2_804 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1218, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_805 = torch.constant.int -1
%1219 = torch.prim.ListConstruct %int-1_805 : (!torch.int) -> !torch.list<int>
%true_806 = torch.constant.bool true
%none_807 = torch.constant.none
%1220 = torch.aten.mean.dim %1218, %1219, %true_806, %none_807 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1220, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_808 = torch.constant.float 1.000000e-05
%int1_809 = torch.constant.int 1
%1221 = torch.aten.add.Scalar %1220, %float1.000000e-05_808, %int1_809 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1221, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1222 = torch.aten.rsqrt %1221 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1222, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1223 = torch.aten.mul.Tensor %1217, %1222 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1223, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1224 = torch.aten.mul.Tensor %52, %1223 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1224, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1225 = torch.aten.div.Tensor %1224, %53 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1225, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_810 = torch.constant.float -2.400000e+02
%float2.400000e02_811 = torch.constant.float 2.400000e+02
%1226 = torch.aten.clamp %1225, %float-2.400000e02_810, %float2.400000e02_811 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1226, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_812 = torch.constant.int 26
%1227 = torch.prims.convert_element_type %1226, %int26_812 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1227, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_813 = torch.constant.int -2
%int-1_814 = torch.constant.int -1
%1228 = torch.aten.transpose.int %54, %int-2_813, %int-1_814 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_815 = torch.constant.int 4096
%1229 = torch.prim.ListConstruct %566, %int4096_815 : (!torch.int, !torch.int) -> !torch.list<int>
%1230 = torch.aten.view %1227, %1229 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1230, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1231 = torch.aten.mm %1230, %1228 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1231, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_816 = torch.constant.int 1
%int4096_817 = torch.constant.int 4096
%1232 = torch.prim.ListConstruct %int1_816, %566, %int4096_817 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1233 = torch.aten.view %1231, %1232 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1233, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_818 = torch.constant.int 15
%1234 = torch.prims.convert_element_type %1233, %int15_818 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1234, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1235 = torch.aten.div.Tensor %1224, %55 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1235, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_819 = torch.constant.float -2.400000e+02
%float2.400000e02_820 = torch.constant.float 2.400000e+02
%1236 = torch.aten.clamp %1235, %float-2.400000e02_819, %float2.400000e02_820 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1236, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_821 = torch.constant.int 26
%1237 = torch.prims.convert_element_type %1236, %int26_821 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1237, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_822 = torch.constant.int -2
%int-1_823 = torch.constant.int -1
%1238 = torch.aten.transpose.int %56, %int-2_822, %int-1_823 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_824 = torch.constant.int 4096
%1239 = torch.prim.ListConstruct %566, %int4096_824 : (!torch.int, !torch.int) -> !torch.list<int>
%1240 = torch.aten.view %1237, %1239 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1240, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1241 = torch.aten.mm %1240, %1238 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1241, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_825 = torch.constant.int 1
%int1024_826 = torch.constant.int 1024
%1242 = torch.prim.ListConstruct %int1_825, %566, %int1024_826 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1243 = torch.aten.view %1241, %1242 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1243, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_827 = torch.constant.int 15
%1244 = torch.prims.convert_element_type %1243, %int15_827 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1244, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1245 = torch.aten.div.Tensor %1224, %57 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1245, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_828 = torch.constant.float -2.400000e+02
%float2.400000e02_829 = torch.constant.float 2.400000e+02
%1246 = torch.aten.clamp %1245, %float-2.400000e02_828, %float2.400000e02_829 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1246, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_830 = torch.constant.int 26
%1247 = torch.prims.convert_element_type %1246, %int26_830 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1247, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_831 = torch.constant.int -2
%int-1_832 = torch.constant.int -1
%1248 = torch.aten.transpose.int %58, %int-2_831, %int-1_832 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_833 = torch.constant.int 4096
%1249 = torch.prim.ListConstruct %566, %int4096_833 : (!torch.int, !torch.int) -> !torch.list<int>
%1250 = torch.aten.view %1247, %1249 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1250, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1251 = torch.aten.mm %1250, %1248 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1251, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_834 = torch.constant.int 1
%int1024_835 = torch.constant.int 1024
%1252 = torch.prim.ListConstruct %int1_834, %566, %int1024_835 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1253 = torch.aten.view %1251, %1252 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1253, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_836 = torch.constant.int 15
%1254 = torch.prims.convert_element_type %1253, %int15_836 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1254, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_837 = torch.constant.int 1
%int32_838 = torch.constant.int 32
%int128_839 = torch.constant.int 128
%1255 = torch.prim.ListConstruct %int1_837, %566, %int32_838, %int128_839 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1256 = torch.aten.view %1234, %1255 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1256, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_840 = torch.constant.int 1
%int8_841 = torch.constant.int 8
%int128_842 = torch.constant.int 128
%1257 = torch.prim.ListConstruct %int1_840, %566, %int8_841, %int128_842 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1258 = torch.aten.view %1244, %1257 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1258, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_843 = torch.constant.int 1
%int8_844 = torch.constant.int 8
%int128_845 = torch.constant.int 128
%1259 = torch.prim.ListConstruct %int1_843, %566, %int8_844, %int128_845 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1260 = torch.aten.view %1254, %1259 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1260, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
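    // The RoPE angle table is rebuilt from the same constants (131072 positions, theta 5.0e5) for the next block's query/key rotation below.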
%int131072_846 = torch.constant.int 131072
%none_847 = torch.constant.none
%none_848 = torch.constant.none
%cpu_849 = torch.constant.device "cpu"
%false_850 = torch.constant.bool false
%1261 = torch.aten.arange %int131072_846, %none_847, %none_848, %cpu_849, %false_850 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_851 = torch.constant.int 0
%int128_852 = torch.constant.int 128
%none_853 = torch.constant.none
%none_854 = torch.constant.none
%cpu_855 = torch.constant.device "cpu"
%false_856 = torch.constant.bool false
%1262 = torch.aten.arange.start %int0_851, %int128_852, %none_853, %none_854, %cpu_855, %false_856 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_857 = torch.constant.int 2
%1263 = torch.aten.floor_divide.Scalar %1262, %int2_857 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_858 = torch.constant.int 6
%1264 = torch.prims.convert_element_type %1263, %int6_858 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_859 = torch.constant.int 128
%1265 = torch.aten.div.Scalar %1264, %int128_859 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_860 = torch.constant.float 2.000000e+00
%1266 = torch.aten.mul.Scalar %1265, %float2.000000e00_860 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_861 = torch.constant.float 5.000000e+05
%1267 = torch.aten.pow.Scalar %float5.000000e05_861, %1266 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1268 = torch.aten.reciprocal %1267 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_862 = torch.constant.float 1.000000e+00
%1269 = torch.aten.mul.Scalar %1268, %float1.000000e00_862 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_863 = torch.constant.int 131072
%int1_864 = torch.constant.int 1
%1270 = torch.prim.ListConstruct %int131072_863, %int1_864 : (!torch.int, !torch.int) -> !torch.list<int>
%1271 = torch.aten.view %1261, %1270 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1272 = torch.aten.mul.Tensor %1271, %1269 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_865 = torch.constant.int 1
%1273 = torch.aten.size.int %1233, %int1_865 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_866 = torch.constant.int 0
%1274 = torch.aten.add.int %int0_866, %1273 : !torch.int, !torch.int -> !torch.int
%int0_867 = torch.constant.int 0
%int0_868 = torch.constant.int 0
%int1_869 = torch.constant.int 1
%1275 = torch.aten.slice.Tensor %1272, %int0_867, %int0_868, %1274, %int1_869 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1275, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_870 = torch.constant.int 1
%int0_871 = torch.constant.int 0
%int9223372036854775807_872 = torch.constant.int 9223372036854775807
%int1_873 = torch.constant.int 1
%1276 = torch.aten.slice.Tensor %1275, %int1_870, %int0_871, %int9223372036854775807_872, %int1_873 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1276, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_874 = torch.constant.int 1
%int0_875 = torch.constant.int 0
%int9223372036854775807_876 = torch.constant.int 9223372036854775807
%int1_877 = torch.constant.int 1
%1277 = torch.aten.slice.Tensor %1276, %int1_874, %int0_875, %int9223372036854775807_876, %int1_877 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1277, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_878 = torch.constant.int 0
%1278 = torch.aten.unsqueeze %1277, %int0_878 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1278, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_879 = torch.constant.int 1
%int0_880 = torch.constant.int 0
%int9223372036854775807_881 = torch.constant.int 9223372036854775807
%int1_882 = torch.constant.int 1
%1279 = torch.aten.slice.Tensor %1278, %int1_879, %int0_880, %int9223372036854775807_881, %int1_882 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1279, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_883 = torch.constant.int 2
%int0_884 = torch.constant.int 0
%int9223372036854775807_885 = torch.constant.int 9223372036854775807
%int1_886 = torch.constant.int 1
%1280 = torch.aten.slice.Tensor %1279, %int2_883, %int0_884, %int9223372036854775807_885, %int1_886 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1280, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_887 = torch.constant.int 1
%int1_888 = torch.constant.int 1
%int1_889 = torch.constant.int 1
%1281 = torch.prim.ListConstruct %int1_887, %int1_888, %int1_889 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1282 = torch.aten.repeat %1280, %1281 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1282, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
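// Apply rotary position embedding to the 32 query heads: upcast bf16 -> f32, call the sharktank rotary kernel, downcast back to bf16.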
%int6_890 = torch.constant.int 6
%1283 = torch.prims.convert_element_type %1256, %int6_890 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1283, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1284 = torch_c.to_builtin_tensor %1283 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1285 = torch_c.to_builtin_tensor %1282 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1286 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1284, %1285) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1287 = torch_c.from_builtin_tensor %1286 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1287, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_891 = torch.constant.int 15
%1288 = torch.prims.convert_element_type %1287, %int15_891 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1288, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
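// Rebuild the same angle table and slice/broadcast it for the key sequence length.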
%int131072_892 = torch.constant.int 131072
%none_893 = torch.constant.none
%none_894 = torch.constant.none
%cpu_895 = torch.constant.device "cpu"
%false_896 = torch.constant.bool false
%1289 = torch.aten.arange %int131072_892, %none_893, %none_894, %cpu_895, %false_896 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_897 = torch.constant.int 0
%int128_898 = torch.constant.int 128
%none_899 = torch.constant.none
%none_900 = torch.constant.none
%cpu_901 = torch.constant.device "cpu"
%false_902 = torch.constant.bool false
%1290 = torch.aten.arange.start %int0_897, %int128_898, %none_899, %none_900, %cpu_901, %false_902 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_903 = torch.constant.int 2
%1291 = torch.aten.floor_divide.Scalar %1290, %int2_903 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_904 = torch.constant.int 6
%1292 = torch.prims.convert_element_type %1291, %int6_904 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_905 = torch.constant.int 128
%1293 = torch.aten.div.Scalar %1292, %int128_905 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_906 = torch.constant.float 2.000000e+00
%1294 = torch.aten.mul.Scalar %1293, %float2.000000e00_906 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_907 = torch.constant.float 5.000000e+05
%1295 = torch.aten.pow.Scalar %float5.000000e05_907, %1294 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1296 = torch.aten.reciprocal %1295 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_908 = torch.constant.float 1.000000e+00
%1297 = torch.aten.mul.Scalar %1296, %float1.000000e00_908 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_909 = torch.constant.int 131072
%int1_910 = torch.constant.int 1
%1298 = torch.prim.ListConstruct %int131072_909, %int1_910 : (!torch.int, !torch.int) -> !torch.list<int>
%1299 = torch.aten.view %1289, %1298 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1300 = torch.aten.mul.Tensor %1299, %1297 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_911 = torch.constant.int 1
%1301 = torch.aten.size.int %1243, %int1_911 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_912 = torch.constant.int 0
%1302 = torch.aten.add.int %int0_912, %1301 : !torch.int, !torch.int -> !torch.int
%int0_913 = torch.constant.int 0
%int0_914 = torch.constant.int 0
%int1_915 = torch.constant.int 1
%1303 = torch.aten.slice.Tensor %1300, %int0_913, %int0_914, %1302, %int1_915 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1303, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_916 = torch.constant.int 1
%int0_917 = torch.constant.int 0
%int9223372036854775807_918 = torch.constant.int 9223372036854775807
%int1_919 = torch.constant.int 1
%1304 = torch.aten.slice.Tensor %1303, %int1_916, %int0_917, %int9223372036854775807_918, %int1_919 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1304, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_920 = torch.constant.int 1
%int0_921 = torch.constant.int 0
%int9223372036854775807_922 = torch.constant.int 9223372036854775807
%int1_923 = torch.constant.int 1
%1305 = torch.aten.slice.Tensor %1304, %int1_920, %int0_921, %int9223372036854775807_922, %int1_923 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1305, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_924 = torch.constant.int 0
%1306 = torch.aten.unsqueeze %1305, %int0_924 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1306, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_925 = torch.constant.int 1
%int0_926 = torch.constant.int 0
%int9223372036854775807_927 = torch.constant.int 9223372036854775807
%int1_928 = torch.constant.int 1
%1307 = torch.aten.slice.Tensor %1306, %int1_925, %int0_926, %int9223372036854775807_927, %int1_928 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1307, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_929 = torch.constant.int 2
%int0_930 = torch.constant.int 0
%int9223372036854775807_931 = torch.constant.int 9223372036854775807
%int1_932 = torch.constant.int 1
%1308 = torch.aten.slice.Tensor %1307, %int2_929, %int0_930, %int9223372036854775807_931, %int1_932 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1308, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_933 = torch.constant.int 1
%int1_934 = torch.constant.int 1
%int1_935 = torch.constant.int 1
%1309 = torch.prim.ListConstruct %int1_933, %int1_934, %int1_935 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1310 = torch.aten.repeat %1308, %1309 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1310, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
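// Apply rotary position embedding to the 8 key heads.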
%int6_936 = torch.constant.int 6
%1311 = torch.prims.convert_element_type %1258, %int6_936 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1311, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1312 = torch_c.to_builtin_tensor %1311 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1313 = torch_c.to_builtin_tensor %1310 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1314 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1312, %1313) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1315 = torch_c.from_builtin_tensor %1314 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1315, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_937 = torch.constant.int 15
%1316 = torch.prims.convert_element_type %1315, %int15_937 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1316, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
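// Quantize the rotated keys and the values: divide by the shared scale %59, clamp to the f8E4M3FNUZ range [-240, 240], cast to f8.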
%1317 = torch.aten.div.Tensor %1316, %59 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1317, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_938 = torch.constant.float -2.400000e+02
%float2.400000e02_939 = torch.constant.float 2.400000e+02
%1318 = torch.aten.clamp %1317, %float-2.400000e02_938, %float2.400000e02_939 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1318, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_940 = torch.constant.int 26
%1319 = torch.prims.convert_element_type %1318, %int26_940 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1319, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1320 = torch.aten.div.Tensor %1260, %59 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1320, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_941 = torch.constant.float -2.400000e+02
%float2.400000e02_942 = torch.constant.float 2.400000e+02
%1321 = torch.aten.clamp %1320, %float-2.400000e02_941, %float2.400000e02_942 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1321, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_943 = torch.constant.int 26
%1322 = torch.prims.convert_element_type %1321, %int26_943 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1322, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
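// Scatter the quantized keys into the cache (this appears to be the paged KV-cache update): row indices are %arg2 * 64 + 6; the flat [?, 2097152] buffer is viewed as [?, 32, 2, 32, 8, 128] and collapsed to [? * 64, 32, 8, 128] before index_put.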
%int64_944 = torch.constant.int 64
%1323 = torch.aten.mul.Scalar %arg2, %int64_944 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1323, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int6_945 = torch.constant.int 6
%int1_946 = torch.constant.int 1
%1324 = torch.aten.add.Scalar %1323, %int6_945, %int1_946 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1324, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_947 = torch.constant.int 1
%int32_948 = torch.constant.int 32
%int8_949 = torch.constant.int 8
%int128_950 = torch.constant.int 128
%1325 = torch.prim.ListConstruct %int1_947, %670, %int32_948, %int8_949, %int128_950 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1326 = torch.aten.view %1319, %1325 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1326, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_951 = torch.constant.int 32
%int8_952 = torch.constant.int 8
%int128_953 = torch.constant.int 128
%1327 = torch.prim.ListConstruct %670, %int32_951, %int8_952, %int128_953 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1328 = torch.aten.view %1326, %1327 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1328, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1329 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1330 = torch.aten.view %1324, %1329 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1330, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_954 = torch.constant.int 32
%int2_955 = torch.constant.int 2
%int32_956 = torch.constant.int 32
%int8_957 = torch.constant.int 8
%int128_958 = torch.constant.int 128
%1331 = torch.prim.ListConstruct %661, %int32_954, %int2_955, %int32_956, %int8_957, %int128_958 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1332 = torch.aten.view %1139, %1331 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1332, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_959 = torch.constant.int 32
%1333 = torch.aten.mul.int %661, %int32_959 : !torch.int, !torch.int -> !torch.int
%int2_960 = torch.constant.int 2
%1334 = torch.aten.mul.int %1333, %int2_960 : !torch.int, !torch.int -> !torch.int
%int32_961 = torch.constant.int 32
%int8_962 = torch.constant.int 8
%int128_963 = torch.constant.int 128
%1335 = torch.prim.ListConstruct %1334, %int32_961, %int8_962, %int128_963 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1336 = torch.aten.view %1332, %1335 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1336, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1337 = torch.prim.ListConstruct %1330 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_964 = torch.constant.bool false
%1338 = torch.aten.index_put %1336, %1337, %1328, %false_964 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1338, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_965 = torch.constant.int 32
%int2_966 = torch.constant.int 2
%int32_967 = torch.constant.int 32
%int8_968 = torch.constant.int 8
%int128_969 = torch.constant.int 128
%1339 = torch.prim.ListConstruct %661, %int32_965, %int2_966, %int32_967, %int8_968, %int128_969 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1340 = torch.aten.view %1338, %1339 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1340, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_970 = torch.constant.int 2097152
%1341 = torch.prim.ListConstruct %661, %int2097152_970 : (!torch.int, !torch.int) -> !torch.list<int>
%1342 = torch.aten.view %1340, %1341 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1342, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
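// Repeat the scatter for the quantized values at the adjacent rows (indices + 1), then fold the buffer back to its flat [?, 2097152] layout.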
%int32_971 = torch.constant.int 32
%int2_972 = torch.constant.int 2
%int32_973 = torch.constant.int 32
%int8_974 = torch.constant.int 8
%int128_975 = torch.constant.int 128
%1343 = torch.prim.ListConstruct %661, %int32_971, %int2_972, %int32_973, %int8_974, %int128_975 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1344 = torch.aten.view %1342, %1343 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1344, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_976 = torch.constant.int 32
%int8_977 = torch.constant.int 8
%int128_978 = torch.constant.int 128
%1345 = torch.prim.ListConstruct %1334, %int32_976, %int8_977, %int128_978 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1346 = torch.aten.view %1344, %1345 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1346, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_979 = torch.constant.int 1
%int32_980 = torch.constant.int 32
%int8_981 = torch.constant.int 8
%int128_982 = torch.constant.int 128
%1347 = torch.prim.ListConstruct %int1_979, %670, %int32_980, %int8_981, %int128_982 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1348 = torch.aten.view %1322, %1347 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1348, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_983 = torch.constant.int 32
%int8_984 = torch.constant.int 8
%int128_985 = torch.constant.int 128
%1349 = torch.prim.ListConstruct %670, %int32_983, %int8_984, %int128_985 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1350 = torch.aten.view %1348, %1349 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1350, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_986 = torch.constant.int 1
%int1_987 = torch.constant.int 1
%1351 = torch.aten.add.Scalar %1324, %int1_986, %int1_987 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1351, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1352 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1353 = torch.aten.view %1351, %1352 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1353, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1354 = torch.prim.ListConstruct %1353 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_988 = torch.constant.bool false
%1355 = torch.aten.index_put %1346, %1354, %1350, %false_988 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1355, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_989 = torch.constant.int 32
%int2_990 = torch.constant.int 2
%int32_991 = torch.constant.int 32
%int8_992 = torch.constant.int 8
%int128_993 = torch.constant.int 128
%1356 = torch.prim.ListConstruct %661, %int32_989, %int2_990, %int32_991, %int8_992, %int128_993 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1357 = torch.aten.view %1355, %1356 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1357, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_994 = torch.constant.int 2097152
%1358 = torch.prim.ListConstruct %661, %int2097152_994 : (!torch.int, !torch.int) -> !torch.list<int>
%1359 = torch.aten.view %1357, %1358 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1359, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
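// Grouped-query attention expansion: replicate each of the 8 KV heads 4x (unsqueeze -> expand -> clone -> view) to match the 32 query heads.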
%int-2_995 = torch.constant.int -2
%1360 = torch.aten.unsqueeze %1319, %int-2_995 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1360, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_996 = torch.constant.int 1
%int8_997 = torch.constant.int 8
%int4_998 = torch.constant.int 4
%int128_999 = torch.constant.int 128
%1361 = torch.prim.ListConstruct %int1_996, %1301, %int8_997, %int4_998, %int128_999 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1000 = torch.constant.bool false
%1362 = torch.aten.expand %1360, %1361, %false_1000 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1362, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1001 = torch.constant.int 0
%1363 = torch.aten.clone %1362, %int0_1001 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1363, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1002 = torch.constant.int 1
%int32_1003 = torch.constant.int 32
%int128_1004 = torch.constant.int 128
%1364 = torch.prim.ListConstruct %int1_1002, %1301, %int32_1003, %int128_1004 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1365 = torch.aten._unsafe_view %1363, %1364 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1365, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1005 = torch.constant.int -2
%1366 = torch.aten.unsqueeze %1322, %int-2_1005 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1366, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1006 = torch.constant.int 1
%1367 = torch.aten.size.int %1253, %int1_1006 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1007 = torch.constant.int 1
%int8_1008 = torch.constant.int 8
%int4_1009 = torch.constant.int 4
%int128_1010 = torch.constant.int 128
%1368 = torch.prim.ListConstruct %int1_1007, %1367, %int8_1008, %int4_1009, %int128_1010 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1011 = torch.constant.bool false
%1369 = torch.aten.expand %1366, %1368, %false_1011 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1369, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1012 = torch.constant.int 0
%1370 = torch.aten.clone %1369, %int0_1012 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1370, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1013 = torch.constant.int 1
%int32_1014 = torch.constant.int 32
%int128_1015 = torch.constant.int 128
%1371 = torch.prim.ListConstruct %int1_1013, %1367, %int32_1014, %int128_1015 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1372 = torch.aten._unsafe_view %1370, %1371 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1372, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
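// Dequantize the expanded keys and values for attention: f8 -> f32, multiply by %59, cast to bf16.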
%int6_1016 = torch.constant.int 6
%1373 = torch.prims.convert_element_type %1365, %int6_1016 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1373, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1374 = torch.aten.mul.Tensor %1373, %59 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1374, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1017 = torch.constant.int 15
%1375 = torch.prims.convert_element_type %1374, %int15_1017 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1375, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1018 = torch.constant.int 6
%1376 = torch.prims.convert_element_type %1372, %int6_1018 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1376, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1377 = torch.aten.mul.Tensor %1376, %59 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1377, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1019 = torch.constant.int 15
%1378 = torch.prims.convert_element_type %1377, %int15_1019 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1378, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
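// Transpose Q, K, and V to [1, 32, seq, 128] and run causal scaled-dot-product attention.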
%int1_1020 = torch.constant.int 1
%int2_1021 = torch.constant.int 2
%1379 = torch.aten.transpose.int %1288, %int1_1020, %int2_1021 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1379, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1022 = torch.constant.int 1
%int2_1023 = torch.constant.int 2
%1380 = torch.aten.transpose.int %1375, %int1_1022, %int2_1023 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1380, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1024 = torch.constant.int 1
%int2_1025 = torch.constant.int 2
%1381 = torch.aten.transpose.int %1378, %int1_1024, %int2_1025 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1381, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1026 = torch.constant.float 0.000000e+00
%true_1027 = torch.constant.bool true
%none_1028 = torch.constant.none
%none_1029 = torch.constant.none
%1382:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1379, %1380, %1381, %float0.000000e00_1026, %true_1027, %none_1028, %none_1029) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1382#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
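// Transpose the attention output back to [1, seq, 32, 128], flatten the heads to 4096, and quantize for the output projection.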
%int1_1030 = torch.constant.int 1
%int2_1031 = torch.constant.int 2
%1383 = torch.aten.transpose.int %1382#0, %int1_1030, %int2_1031 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1383, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1032 = torch.constant.int 1
%int4096_1033 = torch.constant.int 4096
%1384 = torch.prim.ListConstruct %int1_1032, %1273, %int4096_1033 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1385 = torch.aten.view %1383, %1384 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1385, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1386 = torch.aten.div.Tensor %1385, %60 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1386, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1034 = torch.constant.float -2.400000e+02
%float2.400000e02_1035 = torch.constant.float 2.400000e+02
%1387 = torch.aten.clamp %1386, %float-2.400000e02_1034, %float2.400000e02_1035 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1387, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1036 = torch.constant.int 26
%1388 = torch.prims.convert_element_type %1387, %int26_1036 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1388, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
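// Output projection: f8 matmul against the transposed [4096,4096] weight, dequantize to bf16, add the residual %1217.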
%int-2_1037 = torch.constant.int -2
%int-1_1038 = torch.constant.int -1
%1389 = torch.aten.transpose.int %61, %int-2_1037, %int-1_1038 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1039 = torch.constant.int 4096
%1390 = torch.prim.ListConstruct %1273, %int4096_1039 : (!torch.int, !torch.int) -> !torch.list<int>
%1391 = torch.aten.view %1388, %1390 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1391, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1392 = torch.aten.mm %1391, %1389 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1392, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1040 = torch.constant.int 1
%int4096_1041 = torch.constant.int 4096
%1393 = torch.prim.ListConstruct %int1_1040, %1273, %int4096_1041 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1394 = torch.aten.view %1392, %1393 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1394, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1042 = torch.constant.int 15
%1395 = torch.prims.convert_element_type %1394, %int15_1042 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1395, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1043 = torch.constant.int 1
%1396 = torch.aten.add.Tensor %1217, %1395, %int1_1043 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1396, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
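// RMSNorm (FFN norm): mean of squares over the hidden dim, add eps 1e-05, rsqrt, scale by the norm weight %62, then quantize for the gate projection.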
%int2_1044 = torch.constant.int 2
%1397 = torch.aten.pow.Tensor_Scalar %1396, %int2_1044 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1397, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1045 = torch.constant.int -1
%1398 = torch.prim.ListConstruct %int-1_1045 : (!torch.int) -> !torch.list<int>
%true_1046 = torch.constant.bool true
%none_1047 = torch.constant.none
%1399 = torch.aten.mean.dim %1397, %1398, %true_1046, %none_1047 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1399, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1048 = torch.constant.float 1.000000e-05
%int1_1049 = torch.constant.int 1
%1400 = torch.aten.add.Scalar %1399, %float1.000000e-05_1048, %int1_1049 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1400, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1401 = torch.aten.rsqrt %1400 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1401, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1402 = torch.aten.mul.Tensor %1396, %1401 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1402, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1403 = torch.aten.mul.Tensor %62, %1402 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1403, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1404 = torch.aten.div.Tensor %1403, %63 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1404, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1050 = torch.constant.float -2.400000e+02
%float2.400000e02_1051 = torch.constant.float 2.400000e+02
%1405 = torch.aten.clamp %1404, %float-2.400000e02_1050, %float2.400000e02_1051 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1405, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1052 = torch.constant.int 26
%1406 = torch.prims.convert_element_type %1405, %int26_1052 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1406, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
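// Gate projection (4096 -> 14336) followed by SiLU.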
%int-2_1053 = torch.constant.int -2
%int-1_1054 = torch.constant.int -1
%1407 = torch.aten.transpose.int %64, %int-2_1053, %int-1_1054 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1055 = torch.constant.int 4096
%1408 = torch.prim.ListConstruct %566, %int4096_1055 : (!torch.int, !torch.int) -> !torch.list<int>
%1409 = torch.aten.view %1406, %1408 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1409, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1410 = torch.aten.mm %1409, %1407 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1410, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1056 = torch.constant.int 1
%int14336_1057 = torch.constant.int 14336
%1411 = torch.prim.ListConstruct %int1_1056, %566, %int14336_1057 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1412 = torch.aten.view %1410, %1411 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1412, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1058 = torch.constant.int 15
%1413 = torch.prims.convert_element_type %1412, %int15_1058 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1413, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1414 = torch.aten.silu %1413 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1414, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
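// Up projection (4096 -> 14336) and elementwise multiply with the SiLU-gated activations (SwiGLU).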
%1415 = torch.aten.div.Tensor %1403, %65 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1415, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1059 = torch.constant.float -2.400000e+02
%float2.400000e02_1060 = torch.constant.float 2.400000e+02
%1416 = torch.aten.clamp %1415, %float-2.400000e02_1059, %float2.400000e02_1060 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1416, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1061 = torch.constant.int 26
%1417 = torch.prims.convert_element_type %1416, %int26_1061 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1417, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1062 = torch.constant.int -2
%int-1_1063 = torch.constant.int -1
%1418 = torch.aten.transpose.int %66, %int-2_1062, %int-1_1063 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1064 = torch.constant.int 4096
%1419 = torch.prim.ListConstruct %566, %int4096_1064 : (!torch.int, !torch.int) -> !torch.list<int>
%1420 = torch.aten.view %1417, %1419 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1420, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1421 = torch.aten.mm %1420, %1418 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1421, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1065 = torch.constant.int 1
%int14336_1066 = torch.constant.int 14336
%1422 = torch.prim.ListConstruct %int1_1065, %566, %int14336_1066 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1423 = torch.aten.view %1421, %1422 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1423, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1067 = torch.constant.int 15
%1424 = torch.prims.convert_element_type %1423, %int15_1067 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1424, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1425 = torch.aten.mul.Tensor %1414, %1424 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1425, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
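// Quantize the gated activations, apply the down projection (14336 -> 4096), dequantize, and add the residual.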
%1426 = torch.aten.div.Tensor %1425, %67 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1426, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1068 = torch.constant.float -2.400000e+02
%float2.400000e02_1069 = torch.constant.float 2.400000e+02
%1427 = torch.aten.clamp %1426, %float-2.400000e02_1068, %float2.400000e02_1069 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1427, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1070 = torch.constant.int 26
%1428 = torch.prims.convert_element_type %1427, %int26_1070 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1428, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1071 = torch.constant.int -2
%int-1_1072 = torch.constant.int -1
%1429 = torch.aten.transpose.int %68, %int-2_1071, %int-1_1072 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1073 = torch.constant.int 1
%1430 = torch.aten.size.int %1412, %int1_1073 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1074 = torch.constant.int 14336
%1431 = torch.prim.ListConstruct %1430, %int14336_1074 : (!torch.int, !torch.int) -> !torch.list<int>
%1432 = torch.aten.view %1428, %1431 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1432, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1433 = torch.aten.mm %1432, %1429 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1433, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1075 = torch.constant.int 1
%int4096_1076 = torch.constant.int 4096
%1434 = torch.prim.ListConstruct %int1_1075, %1430, %int4096_1076 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1435 = torch.aten.view %1433, %1434 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1435, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1077 = torch.constant.int 15
%1436 = torch.prims.convert_element_type %1435, %int15_1077 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1436, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1078 = torch.constant.int 1
%1437 = torch.aten.add.Tensor %1396, %1436, %int1_1078 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1437, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
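// RMSNorm at the start of the next decoder block, then quantize for its Q projection.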
%int2_1079 = torch.constant.int 2
%1438 = torch.aten.pow.Tensor_Scalar %1437, %int2_1079 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1438, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1080 = torch.constant.int -1
%1439 = torch.prim.ListConstruct %int-1_1080 : (!torch.int) -> !torch.list<int>
%true_1081 = torch.constant.bool true
%none_1082 = torch.constant.none
%1440 = torch.aten.mean.dim %1438, %1439, %true_1081, %none_1082 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1440, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1083 = torch.constant.float 1.000000e-05
%int1_1084 = torch.constant.int 1
%1441 = torch.aten.add.Scalar %1440, %float1.000000e-05_1083, %int1_1084 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1441, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1442 = torch.aten.rsqrt %1441 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1442, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1443 = torch.aten.mul.Tensor %1437, %1442 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1443, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1444 = torch.aten.mul.Tensor %69, %1443 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1444, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1445 = torch.aten.div.Tensor %1444, %70 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1445, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1085 = torch.constant.float -2.400000e+02
%float2.400000e02_1086 = torch.constant.float 2.400000e+02
%1446 = torch.aten.clamp %1445, %float-2.400000e02_1085, %float2.400000e02_1086 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1446, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1087 = torch.constant.int 26
%1447 = torch.prims.convert_element_type %1446, %int26_1087 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1447, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
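// Q projection: f8 matmul against the transposed [4096,4096] weight, dequantized to bf16.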
%int-2_1088 = torch.constant.int -2
%int-1_1089 = torch.constant.int -1
%1448 = torch.aten.transpose.int %71, %int-2_1088, %int-1_1089 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1090 = torch.constant.int 4096
%1449 = torch.prim.ListConstruct %566, %int4096_1090 : (!torch.int, !torch.int) -> !torch.list<int>
%1450 = torch.aten.view %1447, %1449 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1450, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1451 = torch.aten.mm %1450, %1448 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1451, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1091 = torch.constant.int 1
%int4096_1092 = torch.constant.int 4096
%1452 = torch.prim.ListConstruct %int1_1091, %566, %int4096_1092 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1453 = torch.aten.view %1451, %1452 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1453, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1093 = torch.constant.int 15
%1454 = torch.prims.convert_element_type %1453, %int15_1093 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1454, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
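// K projection: re-quantize the normed activations with the K input scale, f8 matmul (4096 -> 1024), dequantize to bf16.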
%1455 = torch.aten.div.Tensor %1444, %72 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1455, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1094 = torch.constant.float -2.400000e+02
%float2.400000e02_1095 = torch.constant.float 2.400000e+02
%1456 = torch.aten.clamp %1455, %float-2.400000e02_1094, %float2.400000e02_1095 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1456, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1096 = torch.constant.int 26
%1457 = torch.prims.convert_element_type %1456, %int26_1096 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1457, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1097 = torch.constant.int -2
%int-1_1098 = torch.constant.int -1
%1458 = torch.aten.transpose.int %73, %int-2_1097, %int-1_1098 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1099 = torch.constant.int 4096
%1459 = torch.prim.ListConstruct %566, %int4096_1099 : (!torch.int, !torch.int) -> !torch.list<int>
%1460 = torch.aten.view %1457, %1459 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1460, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1461 = torch.aten.mm %1460, %1458 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1461, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1100 = torch.constant.int 1
%int1024_1101 = torch.constant.int 1024
%1462 = torch.prim.ListConstruct %int1_1100, %566, %int1024_1101 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1463 = torch.aten.view %1461, %1462 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1463, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1102 = torch.constant.int 15
%1464 = torch.prims.convert_element_type %1463, %int15_1102 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1464, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
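// V projection: same pattern as K (4096 -> 1024).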
%1465 = torch.aten.div.Tensor %1444, %74 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1465, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1103 = torch.constant.float -2.400000e+02
%float2.400000e02_1104 = torch.constant.float 2.400000e+02
%1466 = torch.aten.clamp %1465, %float-2.400000e02_1103, %float2.400000e02_1104 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1466, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1105 = torch.constant.int 26
%1467 = torch.prims.convert_element_type %1466, %int26_1105 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1467, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1106 = torch.constant.int -2
%int-1_1107 = torch.constant.int -1
%1468 = torch.aten.transpose.int %75, %int-2_1106, %int-1_1107 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1108 = torch.constant.int 4096
%1469 = torch.prim.ListConstruct %566, %int4096_1108 : (!torch.int, !torch.int) -> !torch.list<int>
%1470 = torch.aten.view %1467, %1469 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1470, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1471 = torch.aten.mm %1470, %1468 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1471, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1109 = torch.constant.int 1
%int1024_1110 = torch.constant.int 1024
%1472 = torch.prim.ListConstruct %int1_1109, %566, %int1024_1110 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1473 = torch.aten.view %1471, %1472 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1473, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1111 = torch.constant.int 15
%1474 = torch.prims.convert_element_type %1473, %int15_1111 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1474, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
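// Reshape Q to [1, seq, 32, 128] and K/V to [1, seq, 8, 128] heads.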
%int1_1112 = torch.constant.int 1
%int32_1113 = torch.constant.int 32
%int128_1114 = torch.constant.int 128
%1475 = torch.prim.ListConstruct %int1_1112, %566, %int32_1113, %int128_1114 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1476 = torch.aten.view %1454, %1475 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1476, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1115 = torch.constant.int 1
%int8_1116 = torch.constant.int 8
%int128_1117 = torch.constant.int 128
%1477 = torch.prim.ListConstruct %int1_1115, %566, %int8_1116, %int128_1117 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1478 = torch.aten.view %1464, %1477 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1478, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1118 = torch.constant.int 1
%int8_1119 = torch.constant.int 8
%int128_1120 = torch.constant.int 128
%1479 = torch.prim.ListConstruct %int1_1118, %566, %int8_1119, %int128_1120 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1480 = torch.aten.view %1474, %1479 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1480, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
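// Rebuild the RoPE angle table for this block's rotary embedding.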
%int131072_1121 = torch.constant.int 131072
%none_1122 = torch.constant.none
%none_1123 = torch.constant.none
%cpu_1124 = torch.constant.device "cpu"
%false_1125 = torch.constant.bool false
%1481 = torch.aten.arange %int131072_1121, %none_1122, %none_1123, %cpu_1124, %false_1125 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1126 = torch.constant.int 0
%int128_1127 = torch.constant.int 128
%none_1128 = torch.constant.none
%none_1129 = torch.constant.none
%cpu_1130 = torch.constant.device "cpu"
%false_1131 = torch.constant.bool false
%1482 = torch.aten.arange.start %int0_1126, %int128_1127, %none_1128, %none_1129, %cpu_1130, %false_1131 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1132 = torch.constant.int 2
%1483 = torch.aten.floor_divide.Scalar %1482, %int2_1132 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1133 = torch.constant.int 6
%1484 = torch.prims.convert_element_type %1483, %int6_1133 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1134 = torch.constant.int 128
%1485 = torch.aten.div.Scalar %1484, %int128_1134 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1135 = torch.constant.float 2.000000e+00
%1486 = torch.aten.mul.Scalar %1485, %float2.000000e00_1135 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1136 = torch.constant.float 5.000000e+05
%1487 = torch.aten.pow.Scalar %float5.000000e05_1136, %1486 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1488 = torch.aten.reciprocal %1487 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1137 = torch.constant.float 1.000000e+00
%1489 = torch.aten.mul.Scalar %1488, %float1.000000e00_1137 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1138 = torch.constant.int 131072
%int1_1139 = torch.constant.int 1
%1490 = torch.prim.ListConstruct %int131072_1138, %int1_1139 : (!torch.int, !torch.int) -> !torch.list<int>
%1491 = torch.aten.view %1481, %1490 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1492 = torch.aten.mul.Tensor %1491, %1489 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1140 = torch.constant.int 1
%1493 = torch.aten.size.int %1453, %int1_1140 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1141 = torch.constant.int 0
%1494 = torch.aten.add.int %int0_1141, %1493 : !torch.int, !torch.int -> !torch.int
%int0_1142 = torch.constant.int 0
%int0_1143 = torch.constant.int 0
%int1_1144 = torch.constant.int 1
%1495 = torch.aten.slice.Tensor %1492, %int0_1142, %int0_1143, %1494, %int1_1144 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1495, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1145 = torch.constant.int 1
%int0_1146 = torch.constant.int 0
%int9223372036854775807_1147 = torch.constant.int 9223372036854775807
%int1_1148 = torch.constant.int 1
%1496 = torch.aten.slice.Tensor %1495, %int1_1145, %int0_1146, %int9223372036854775807_1147, %int1_1148 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1496, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1149 = torch.constant.int 1
%int0_1150 = torch.constant.int 0
%int9223372036854775807_1151 = torch.constant.int 9223372036854775807
%int1_1152 = torch.constant.int 1
%1497 = torch.aten.slice.Tensor %1496, %int1_1149, %int0_1150, %int9223372036854775807_1151, %int1_1152 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1497, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1153 = torch.constant.int 0
%1498 = torch.aten.unsqueeze %1497, %int0_1153 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1498, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1154 = torch.constant.int 1
%int0_1155 = torch.constant.int 0
%int9223372036854775807_1156 = torch.constant.int 9223372036854775807
%int1_1157 = torch.constant.int 1
%1499 = torch.aten.slice.Tensor %1498, %int1_1154, %int0_1155, %int9223372036854775807_1156, %int1_1157 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1499, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1158 = torch.constant.int 2
%int0_1159 = torch.constant.int 0
%int9223372036854775807_1160 = torch.constant.int 9223372036854775807
%int1_1161 = torch.constant.int 1
%1500 = torch.aten.slice.Tensor %1499, %int2_1158, %int0_1159, %int9223372036854775807_1160, %int1_1161 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1500, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1162 = torch.constant.int 1
%int1_1163 = torch.constant.int 1
%int1_1164 = torch.constant.int 1
%1501 = torch.prim.ListConstruct %int1_1162, %int1_1163, %int1_1164 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1502 = torch.aten.repeat %1500, %1501 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1502, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1165 = torch.constant.int 6
%1503 = torch.prims.convert_element_type %1476, %int6_1165 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1503, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1504 = torch_c.to_builtin_tensor %1503 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1505 = torch_c.to_builtin_tensor %1502 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1506 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1504, %1505) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1507 = torch_c.from_builtin_tensor %1506 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1507, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1166 = torch.constant.int 15
%1508 = torch.prims.convert_element_type %1507, %int15_1166 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1508, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
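    // The same RoPE table is recomputed from scratch below and applied to the 8-head K activations (%1478) via
    // the 8-head variant of the rotary kernel; nothing is reused from the Q-path table above.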
%int131072_1167 = torch.constant.int 131072
%none_1168 = torch.constant.none
%none_1169 = torch.constant.none
%cpu_1170 = torch.constant.device "cpu"
%false_1171 = torch.constant.bool false
%1509 = torch.aten.arange %int131072_1167, %none_1168, %none_1169, %cpu_1170, %false_1171 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1172 = torch.constant.int 0
%int128_1173 = torch.constant.int 128
%none_1174 = torch.constant.none
%none_1175 = torch.constant.none
%cpu_1176 = torch.constant.device "cpu"
%false_1177 = torch.constant.bool false
%1510 = torch.aten.arange.start %int0_1172, %int128_1173, %none_1174, %none_1175, %cpu_1176, %false_1177 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1178 = torch.constant.int 2
%1511 = torch.aten.floor_divide.Scalar %1510, %int2_1178 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1179 = torch.constant.int 6
%1512 = torch.prims.convert_element_type %1511, %int6_1179 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1180 = torch.constant.int 128
%1513 = torch.aten.div.Scalar %1512, %int128_1180 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1181 = torch.constant.float 2.000000e+00
%1514 = torch.aten.mul.Scalar %1513, %float2.000000e00_1181 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1182 = torch.constant.float 5.000000e+05
%1515 = torch.aten.pow.Scalar %float5.000000e05_1182, %1514 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1516 = torch.aten.reciprocal %1515 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1183 = torch.constant.float 1.000000e+00
%1517 = torch.aten.mul.Scalar %1516, %float1.000000e00_1183 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1184 = torch.constant.int 131072
%int1_1185 = torch.constant.int 1
%1518 = torch.prim.ListConstruct %int131072_1184, %int1_1185 : (!torch.int, !torch.int) -> !torch.list<int>
%1519 = torch.aten.view %1509, %1518 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1520 = torch.aten.mul.Tensor %1519, %1517 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1186 = torch.constant.int 1
%1521 = torch.aten.size.int %1463, %int1_1186 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1187 = torch.constant.int 0
%1522 = torch.aten.add.int %int0_1187, %1521 : !torch.int, !torch.int -> !torch.int
%int0_1188 = torch.constant.int 0
%int0_1189 = torch.constant.int 0
%int1_1190 = torch.constant.int 1
%1523 = torch.aten.slice.Tensor %1520, %int0_1188, %int0_1189, %1522, %int1_1190 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1523, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1191 = torch.constant.int 1
%int0_1192 = torch.constant.int 0
%int9223372036854775807_1193 = torch.constant.int 9223372036854775807
%int1_1194 = torch.constant.int 1
%1524 = torch.aten.slice.Tensor %1523, %int1_1191, %int0_1192, %int9223372036854775807_1193, %int1_1194 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1524, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1195 = torch.constant.int 1
%int0_1196 = torch.constant.int 0
%int9223372036854775807_1197 = torch.constant.int 9223372036854775807
%int1_1198 = torch.constant.int 1
%1525 = torch.aten.slice.Tensor %1524, %int1_1195, %int0_1196, %int9223372036854775807_1197, %int1_1198 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1525, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1199 = torch.constant.int 0
%1526 = torch.aten.unsqueeze %1525, %int0_1199 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1526, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1200 = torch.constant.int 1
%int0_1201 = torch.constant.int 0
%int9223372036854775807_1202 = torch.constant.int 9223372036854775807
%int1_1203 = torch.constant.int 1
%1527 = torch.aten.slice.Tensor %1526, %int1_1200, %int0_1201, %int9223372036854775807_1202, %int1_1203 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1527, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1204 = torch.constant.int 2
%int0_1205 = torch.constant.int 0
%int9223372036854775807_1206 = torch.constant.int 9223372036854775807
%int1_1207 = torch.constant.int 1
%1528 = torch.aten.slice.Tensor %1527, %int2_1204, %int0_1205, %int9223372036854775807_1206, %int1_1207 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1528, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1208 = torch.constant.int 1
%int1_1209 = torch.constant.int 1
%int1_1210 = torch.constant.int 1
%1529 = torch.prim.ListConstruct %int1_1208, %int1_1209, %int1_1210 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1530 = torch.aten.repeat %1528, %1529 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1530, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1211 = torch.constant.int 6
%1531 = torch.prims.convert_element_type %1478, %int6_1211 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1531, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1532 = torch_c.to_builtin_tensor %1531 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1533 = torch_c.to_builtin_tensor %1530 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1534 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1532, %1533) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1535 = torch_c.from_builtin_tensor %1534 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1535, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_1212 = torch.constant.int 15
%1536 = torch.prims.convert_element_type %1535, %int15_1212 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1536, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
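    // Quantize the rotated K (%1536) and the pass-through V (%1480) for the KV cache: divide by what appears to be
    // the per-tensor KV-cache scale (%76), clamp to the f8E4M3FNUZ finite range [-240, 240], then cast to f8E4M3FNUZ.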
%1537 = torch.aten.div.Tensor %1536, %76 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1537, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1213 = torch.constant.float -2.400000e+02
%float2.400000e02_1214 = torch.constant.float 2.400000e+02
%1538 = torch.aten.clamp %1537, %float-2.400000e02_1213, %float2.400000e02_1214 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1538, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1215 = torch.constant.int 26
%1539 = torch.prims.convert_element_type %1538, %int26_1215 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1539, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1540 = torch.aten.div.Tensor %1480, %76 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1540, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1216 = torch.constant.float -2.400000e+02
%float2.400000e02_1217 = torch.constant.float 2.400000e+02
%1541 = torch.aten.clamp %1540, %float-2.400000e02_1216, %float2.400000e02_1217 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1541, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1218 = torch.constant.int 26
%1542 = torch.prims.convert_element_type %1541, %int26_1218 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1542, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
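    // Scatter K into the paged KV cache: flat slot indices are computed as page_id * 64 plus a fixed offset (8 here,
    // presumably selecting this layer's K sub-block), the quantized K is reshaped to the per-page layout, the f16
    // cache (%1359) is viewed as [pages*64, 32, 8, 128], and index_put writes the new entries.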
%int64_1219 = torch.constant.int 64
%1543 = torch.aten.mul.Scalar %arg2, %int64_1219 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1543, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int8_1220 = torch.constant.int 8
%int1_1221 = torch.constant.int 1
%1544 = torch.aten.add.Scalar %1543, %int8_1220, %int1_1221 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1544, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_1222 = torch.constant.int 1
%int32_1223 = torch.constant.int 32
%int8_1224 = torch.constant.int 8
%int128_1225 = torch.constant.int 128
%1545 = torch.prim.ListConstruct %int1_1222, %670, %int32_1223, %int8_1224, %int128_1225 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1546 = torch.aten.view %1539, %1545 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1546, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1226 = torch.constant.int 32
%int8_1227 = torch.constant.int 8
%int128_1228 = torch.constant.int 128
%1547 = torch.prim.ListConstruct %670, %int32_1226, %int8_1227, %int128_1228 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1548 = torch.aten.view %1546, %1547 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1548, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1549 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1550 = torch.aten.view %1544, %1549 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1550, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_1229 = torch.constant.int 32
%int2_1230 = torch.constant.int 2
%int32_1231 = torch.constant.int 32
%int8_1232 = torch.constant.int 8
%int128_1233 = torch.constant.int 128
%1551 = torch.prim.ListConstruct %661, %int32_1229, %int2_1230, %int32_1231, %int8_1232, %int128_1233 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1552 = torch.aten.view %1359, %1551 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1552, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1234 = torch.constant.int 32
%1553 = torch.aten.mul.int %661, %int32_1234 : !torch.int, !torch.int -> !torch.int
%int2_1235 = torch.constant.int 2
%1554 = torch.aten.mul.int %1553, %int2_1235 : !torch.int, !torch.int -> !torch.int
%int32_1236 = torch.constant.int 32
%int8_1237 = torch.constant.int 8
%int128_1238 = torch.constant.int 128
%1555 = torch.prim.ListConstruct %1554, %int32_1236, %int8_1237, %int128_1238 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1556 = torch.aten.view %1552, %1555 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1556, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1557 = torch.prim.ListConstruct %1550 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1239 = torch.constant.bool false
%1558 = torch.aten.index_put %1556, %1557, %1548, %false_1239 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1558, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1240 = torch.constant.int 32
%int2_1241 = torch.constant.int 2
%int32_1242 = torch.constant.int 32
%int8_1243 = torch.constant.int 8
%int128_1244 = torch.constant.int 128
%1559 = torch.prim.ListConstruct %661, %int32_1240, %int2_1241, %int32_1242, %int8_1243, %int128_1244 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1560 = torch.aten.view %1558, %1559 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1560, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1245 = torch.constant.int 2097152
%1561 = torch.prim.ListConstruct %661, %int2097152_1245 : (!torch.int, !torch.int) -> !torch.list<int>
%1562 = torch.aten.view %1560, %1561 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1562, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
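    // V follows the same pattern: the cache is re-viewed into page layout, the slot indices are advanced by 1 to the
    // adjacent sub-block, and the quantized V (%1542) is scattered in before the cache is flattened back to [?, 2097152].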
%int32_1246 = torch.constant.int 32
%int2_1247 = torch.constant.int 2
%int32_1248 = torch.constant.int 32
%int8_1249 = torch.constant.int 8
%int128_1250 = torch.constant.int 128
%1563 = torch.prim.ListConstruct %661, %int32_1246, %int2_1247, %int32_1248, %int8_1249, %int128_1250 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1564 = torch.aten.view %1562, %1563 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1564, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1251 = torch.constant.int 32
%int8_1252 = torch.constant.int 8
%int128_1253 = torch.constant.int 128
%1565 = torch.prim.ListConstruct %1554, %int32_1251, %int8_1252, %int128_1253 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1566 = torch.aten.view %1564, %1565 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1566, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_1254 = torch.constant.int 1
%int32_1255 = torch.constant.int 32
%int8_1256 = torch.constant.int 8
%int128_1257 = torch.constant.int 128
%1567 = torch.prim.ListConstruct %int1_1254, %670, %int32_1255, %int8_1256, %int128_1257 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1568 = torch.aten.view %1542, %1567 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1568, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1258 = torch.constant.int 32
%int8_1259 = torch.constant.int 8
%int128_1260 = torch.constant.int 128
%1569 = torch.prim.ListConstruct %670, %int32_1258, %int8_1259, %int128_1260 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1570 = torch.aten.view %1568, %1569 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1570, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_1261 = torch.constant.int 1
%int1_1262 = torch.constant.int 1
%1571 = torch.aten.add.Scalar %1544, %int1_1261, %int1_1262 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1571, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1572 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1573 = torch.aten.view %1571, %1572 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1573, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1574 = torch.prim.ListConstruct %1573 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1263 = torch.constant.bool false
%1575 = torch.aten.index_put %1566, %1574, %1570, %false_1263 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1575, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1264 = torch.constant.int 32
%int2_1265 = torch.constant.int 2
%int32_1266 = torch.constant.int 32
%int8_1267 = torch.constant.int 8
%int128_1268 = torch.constant.int 128
%1576 = torch.prim.ListConstruct %661, %int32_1264, %int2_1265, %int32_1266, %int8_1267, %int128_1268 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1577 = torch.aten.view %1575, %1576 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1577, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1269 = torch.constant.int 2097152
%1578 = torch.prim.ListConstruct %661, %int2097152_1269 : (!torch.int, !torch.int) -> !torch.list<int>
%1579 = torch.aten.view %1577, %1578 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1579, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
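    // Grouped-query attention expansion: the 8 KV heads are unsqueezed, expanded by a factor of 4 to 32 heads, and
    // reshaped to [1, seq, 32, 128]; both K and V are then dequantized back to bf16 by multiplying with the same
    // scale (%76).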
%int-2_1270 = torch.constant.int -2
%1580 = torch.aten.unsqueeze %1539, %int-2_1270 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1580, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1271 = torch.constant.int 1
%int8_1272 = torch.constant.int 8
%int4_1273 = torch.constant.int 4
%int128_1274 = torch.constant.int 128
%1581 = torch.prim.ListConstruct %int1_1271, %1521, %int8_1272, %int4_1273, %int128_1274 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1275 = torch.constant.bool false
%1582 = torch.aten.expand %1580, %1581, %false_1275 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1582, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1276 = torch.constant.int 0
%1583 = torch.aten.clone %1582, %int0_1276 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1583, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1277 = torch.constant.int 1
%int32_1278 = torch.constant.int 32
%int128_1279 = torch.constant.int 128
%1584 = torch.prim.ListConstruct %int1_1277, %1521, %int32_1278, %int128_1279 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1585 = torch.aten._unsafe_view %1583, %1584 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1585, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1280 = torch.constant.int -2
%1586 = torch.aten.unsqueeze %1542, %int-2_1280 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1586, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1281 = torch.constant.int 1
%1587 = torch.aten.size.int %1473, %int1_1281 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1282 = torch.constant.int 1
%int8_1283 = torch.constant.int 8
%int4_1284 = torch.constant.int 4
%int128_1285 = torch.constant.int 128
%1588 = torch.prim.ListConstruct %int1_1282, %1587, %int8_1283, %int4_1284, %int128_1285 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1286 = torch.constant.bool false
%1589 = torch.aten.expand %1586, %1588, %false_1286 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1589, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1287 = torch.constant.int 0
%1590 = torch.aten.clone %1589, %int0_1287 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1590, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1288 = torch.constant.int 1
%int32_1289 = torch.constant.int 32
%int128_1290 = torch.constant.int 128
%1591 = torch.prim.ListConstruct %int1_1288, %1587, %int32_1289, %int128_1290 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1592 = torch.aten._unsafe_view %1590, %1591 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1592, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int6_1291 = torch.constant.int 6
%1593 = torch.prims.convert_element_type %1585, %int6_1291 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1593, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1594 = torch.aten.mul.Tensor %1593, %76 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1594, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1292 = torch.constant.int 15
%1595 = torch.prims.convert_element_type %1594, %int15_1292 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1595, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1293 = torch.constant.int 6
%1596 = torch.prims.convert_element_type %1592, %int6_1293 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1596, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1597 = torch.aten.mul.Tensor %1596, %76 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1597, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1294 = torch.constant.int 15
%1598 = torch.prims.convert_element_type %1597, %int15_1294 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1598, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
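    // Q, K, and V are transposed to [1, 32, seq, 128] and run through the CPU flash-attention op with
    // dropout 0.0 and is_causal = true.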
%int1_1295 = torch.constant.int 1
%int2_1296 = torch.constant.int 2
%1599 = torch.aten.transpose.int %1508, %int1_1295, %int2_1296 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1599, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1297 = torch.constant.int 1
%int2_1298 = torch.constant.int 2
%1600 = torch.aten.transpose.int %1595, %int1_1297, %int2_1298 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1600, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1299 = torch.constant.int 1
%int2_1300 = torch.constant.int 2
%1601 = torch.aten.transpose.int %1598, %int1_1299, %int2_1300 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1601, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1301 = torch.constant.float 0.000000e+00
%true_1302 = torch.constant.bool true
%none_1303 = torch.constant.none
%none_1304 = torch.constant.none
%1602:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1599, %1600, %1601, %float0.000000e00_1301, %true_1302, %none_1303, %none_1304) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1602#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
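    // Attention output projection: transpose back, flatten heads to [1, seq, 4096], quantize (scale %77, clamp to
    // ±240), matmul with what is presumably the transposed attn_output weight (%78), dequantize to bf16, and add
    // onto the f32 residual stream (%1437).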
%int1_1305 = torch.constant.int 1
%int2_1306 = torch.constant.int 2
%1603 = torch.aten.transpose.int %1602#0, %int1_1305, %int2_1306 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1603, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1307 = torch.constant.int 1
%int4096_1308 = torch.constant.int 4096
%1604 = torch.prim.ListConstruct %int1_1307, %1493, %int4096_1308 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1605 = torch.aten.view %1603, %1604 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1605, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1606 = torch.aten.div.Tensor %1605, %77 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1606, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1309 = torch.constant.float -2.400000e+02
%float2.400000e02_1310 = torch.constant.float 2.400000e+02
%1607 = torch.aten.clamp %1606, %float-2.400000e02_1309, %float2.400000e02_1310 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1607, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1311 = torch.constant.int 26
%1608 = torch.prims.convert_element_type %1607, %int26_1311 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1608, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1312 = torch.constant.int -2
%int-1_1313 = torch.constant.int -1
%1609 = torch.aten.transpose.int %78, %int-2_1312, %int-1_1313 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1314 = torch.constant.int 4096
%1610 = torch.prim.ListConstruct %1493, %int4096_1314 : (!torch.int, !torch.int) -> !torch.list<int>
%1611 = torch.aten.view %1608, %1610 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1611, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1612 = torch.aten.mm %1611, %1609 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1612, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1315 = torch.constant.int 1
%int4096_1316 = torch.constant.int 4096
%1613 = torch.prim.ListConstruct %int1_1315, %1493, %int4096_1316 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1614 = torch.aten.view %1612, %1613 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1614, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1317 = torch.constant.int 15
%1615 = torch.prims.convert_element_type %1614, %int15_1317 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1615, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1318 = torch.constant.int 1
%1616 = torch.aten.add.Tensor %1437, %1615, %int1_1318 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1616, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
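    // FFN RMSNorm: mean of squares over the last dim, add eps 1e-05, rsqrt, scale the residual, and multiply by the
    // norm weight (%79).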
%int2_1319 = torch.constant.int 2
%1617 = torch.aten.pow.Tensor_Scalar %1616, %int2_1319 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1617, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1320 = torch.constant.int -1
%1618 = torch.prim.ListConstruct %int-1_1320 : (!torch.int) -> !torch.list<int>
%true_1321 = torch.constant.bool true
%none_1322 = torch.constant.none
%1619 = torch.aten.mean.dim %1617, %1618, %true_1321, %none_1322 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1619, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1323 = torch.constant.float 1.000000e-05
%int1_1324 = torch.constant.int 1
%1620 = torch.aten.add.Scalar %1619, %float1.000000e-05_1323, %int1_1324 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1620, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1621 = torch.aten.rsqrt %1620 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1621, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1622 = torch.aten.mul.Tensor %1616, %1621 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1622, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1623 = torch.aten.mul.Tensor %79, %1622 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1623, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
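    // FFN gate and up projections: the normalized activations are quantized twice (scales %80 and %82, clamp ±240),
    // multiplied with the transposed [14336,4096] weights (%81 and %83) in f8E4M3FNUZ, dequantized to bf16, SiLU is
    // applied to the gate branch, and the two branches are multiplied elementwise.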
%1624 = torch.aten.div.Tensor %1623, %80 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1624, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1325 = torch.constant.float -2.400000e+02
%float2.400000e02_1326 = torch.constant.float 2.400000e+02
%1625 = torch.aten.clamp %1624, %float-2.400000e02_1325, %float2.400000e02_1326 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1625, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1327 = torch.constant.int 26
%1626 = torch.prims.convert_element_type %1625, %int26_1327 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1626, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1328 = torch.constant.int -2
%int-1_1329 = torch.constant.int -1
%1627 = torch.aten.transpose.int %81, %int-2_1328, %int-1_1329 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1330 = torch.constant.int 4096
%1628 = torch.prim.ListConstruct %566, %int4096_1330 : (!torch.int, !torch.int) -> !torch.list<int>
%1629 = torch.aten.view %1626, %1628 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1629, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1630 = torch.aten.mm %1629, %1627 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1630, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1331 = torch.constant.int 1
%int14336_1332 = torch.constant.int 14336
%1631 = torch.prim.ListConstruct %int1_1331, %566, %int14336_1332 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1632 = torch.aten.view %1630, %1631 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1632, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1333 = torch.constant.int 15
%1633 = torch.prims.convert_element_type %1632, %int15_1333 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1633, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1634 = torch.aten.silu %1633 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1634, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1635 = torch.aten.div.Tensor %1623, %82 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1635, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1334 = torch.constant.float -2.400000e+02
%float2.400000e02_1335 = torch.constant.float 2.400000e+02
%1636 = torch.aten.clamp %1635, %float-2.400000e02_1334, %float2.400000e02_1335 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1636, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1336 = torch.constant.int 26
%1637 = torch.prims.convert_element_type %1636, %int26_1336 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1637, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1337 = torch.constant.int -2
%int-1_1338 = torch.constant.int -1
%1638 = torch.aten.transpose.int %83, %int-2_1337, %int-1_1338 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1339 = torch.constant.int 4096
%1639 = torch.prim.ListConstruct %566, %int4096_1339 : (!torch.int, !torch.int) -> !torch.list<int>
%1640 = torch.aten.view %1637, %1639 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1640, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1641 = torch.aten.mm %1640, %1638 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1641, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1340 = torch.constant.int 1
%int14336_1341 = torch.constant.int 14336
%1642 = torch.prim.ListConstruct %int1_1340, %566, %int14336_1341 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1643 = torch.aten.view %1641, %1642 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1643, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1342 = torch.constant.int 15
%1644 = torch.prims.convert_element_type %1643, %int15_1342 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1644, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1645 = torch.aten.mul.Tensor %1634, %1644 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1645, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
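    // Down projection and residual: the gated product is quantized (scale %84, clamp ±240), multiplied with the
    // transposed [4096,14336] down weight (%85), dequantized to bf16, and added onto the residual stream (%1616).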
%1646 = torch.aten.div.Tensor %1645, %84 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1646, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1343 = torch.constant.float -2.400000e+02
%float2.400000e02_1344 = torch.constant.float 2.400000e+02
%1647 = torch.aten.clamp %1646, %float-2.400000e02_1343, %float2.400000e02_1344 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1647, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1345 = torch.constant.int 26
%1648 = torch.prims.convert_element_type %1647, %int26_1345 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1648, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1346 = torch.constant.int -2
%int-1_1347 = torch.constant.int -1
%1649 = torch.aten.transpose.int %85, %int-2_1346, %int-1_1347 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1348 = torch.constant.int 1
%1650 = torch.aten.size.int %1632, %int1_1348 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1349 = torch.constant.int 14336
%1651 = torch.prim.ListConstruct %1650, %int14336_1349 : (!torch.int, !torch.int) -> !torch.list<int>
%1652 = torch.aten.view %1648, %1651 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1652, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1653 = torch.aten.mm %1652, %1649 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1653, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1350 = torch.constant.int 1
%int4096_1351 = torch.constant.int 4096
%1654 = torch.prim.ListConstruct %int1_1350, %1650, %int4096_1351 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1655 = torch.aten.view %1653, %1654 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1655, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1352 = torch.constant.int 15
%1656 = torch.prims.convert_element_type %1655, %int15_1352 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1656, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1353 = torch.constant.int 1
%1657 = torch.aten.add.Tensor %1616, %1656, %int1_1353 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1657, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
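    // Next transformer block: attention RMSNorm with weight %86, then quantized Q/K/V projections (weights %88, %90,
    // %92 with input scales %87, %89, %91) producing [1, seq, 32, 128] Q and [1, seq, 8, 128] K/V in bf16.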
%int2_1354 = torch.constant.int 2
%1658 = torch.aten.pow.Tensor_Scalar %1657, %int2_1354 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1658, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1355 = torch.constant.int -1
%1659 = torch.prim.ListConstruct %int-1_1355 : (!torch.int) -> !torch.list<int>
%true_1356 = torch.constant.bool true
%none_1357 = torch.constant.none
%1660 = torch.aten.mean.dim %1658, %1659, %true_1356, %none_1357 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1660, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1358 = torch.constant.float 1.000000e-05
%int1_1359 = torch.constant.int 1
%1661 = torch.aten.add.Scalar %1660, %float1.000000e-05_1358, %int1_1359 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1661, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1662 = torch.aten.rsqrt %1661 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1662, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1663 = torch.aten.mul.Tensor %1657, %1662 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1663, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1664 = torch.aten.mul.Tensor %86, %1663 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1664, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1665 = torch.aten.div.Tensor %1664, %87 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1665, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1360 = torch.constant.float -2.400000e+02
%float2.400000e02_1361 = torch.constant.float 2.400000e+02
%1666 = torch.aten.clamp %1665, %float-2.400000e02_1360, %float2.400000e02_1361 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1666, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1362 = torch.constant.int 26
%1667 = torch.prims.convert_element_type %1666, %int26_1362 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1667, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1363 = torch.constant.int -2
%int-1_1364 = torch.constant.int -1
%1668 = torch.aten.transpose.int %88, %int-2_1363, %int-1_1364 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1365 = torch.constant.int 4096
%1669 = torch.prim.ListConstruct %566, %int4096_1365 : (!torch.int, !torch.int) -> !torch.list<int>
%1670 = torch.aten.view %1667, %1669 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1670, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1671 = torch.aten.mm %1670, %1668 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1671, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1366 = torch.constant.int 1
%int4096_1367 = torch.constant.int 4096
%1672 = torch.prim.ListConstruct %int1_1366, %566, %int4096_1367 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1673 = torch.aten.view %1671, %1672 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1673, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1368 = torch.constant.int 15
%1674 = torch.prims.convert_element_type %1673, %int15_1368 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1674, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1675 = torch.aten.div.Tensor %1664, %89 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1675, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1369 = torch.constant.float -2.400000e+02
%float2.400000e02_1370 = torch.constant.float 2.400000e+02
%1676 = torch.aten.clamp %1675, %float-2.400000e02_1369, %float2.400000e02_1370 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1676, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1371 = torch.constant.int 26
%1677 = torch.prims.convert_element_type %1676, %int26_1371 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1677, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1372 = torch.constant.int -2
%int-1_1373 = torch.constant.int -1
%1678 = torch.aten.transpose.int %90, %int-2_1372, %int-1_1373 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1374 = torch.constant.int 4096
%1679 = torch.prim.ListConstruct %566, %int4096_1374 : (!torch.int, !torch.int) -> !torch.list<int>
%1680 = torch.aten.view %1677, %1679 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1680, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1681 = torch.aten.mm %1680, %1678 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1681, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1375 = torch.constant.int 1
%int1024_1376 = torch.constant.int 1024
%1682 = torch.prim.ListConstruct %int1_1375, %566, %int1024_1376 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1683 = torch.aten.view %1681, %1682 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1683, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1377 = torch.constant.int 15
%1684 = torch.prims.convert_element_type %1683, %int15_1377 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1684, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1685 = torch.aten.div.Tensor %1664, %91 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1685, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1378 = torch.constant.float -2.400000e+02
%float2.400000e02_1379 = torch.constant.float 2.400000e+02
%1686 = torch.aten.clamp %1685, %float-2.400000e02_1378, %float2.400000e02_1379 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1686, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1380 = torch.constant.int 26
%1687 = torch.prims.convert_element_type %1686, %int26_1380 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1687, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1381 = torch.constant.int -2
%int-1_1382 = torch.constant.int -1
%1688 = torch.aten.transpose.int %92, %int-2_1381, %int-1_1382 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1383 = torch.constant.int 4096
%1689 = torch.prim.ListConstruct %566, %int4096_1383 : (!torch.int, !torch.int) -> !torch.list<int>
%1690 = torch.aten.view %1687, %1689 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1690, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1691 = torch.aten.mm %1690, %1688 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1691, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1384 = torch.constant.int 1
%int1024_1385 = torch.constant.int 1024
%1692 = torch.prim.ListConstruct %int1_1384, %566, %int1024_1385 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1693 = torch.aten.view %1691, %1692 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1693, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1386 = torch.constant.int 15
%1694 = torch.prims.convert_element_type %1693, %int15_1386 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1694, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_1387 = torch.constant.int 1
%int32_1388 = torch.constant.int 32
%int128_1389 = torch.constant.int 128
%1695 = torch.prim.ListConstruct %int1_1387, %566, %int32_1388, %int128_1389 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1696 = torch.aten.view %1674, %1695 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1696, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1390 = torch.constant.int 1
%int8_1391 = torch.constant.int 8
%int128_1392 = torch.constant.int 128
%1697 = torch.prim.ListConstruct %int1_1390, %566, %int8_1391, %int128_1392 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1698 = torch.aten.view %1684, %1697 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1698, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1393 = torch.constant.int 1
%int8_1394 = torch.constant.int 8
%int128_1395 = torch.constant.int 128
%1699 = torch.prim.ListConstruct %int1_1393, %566, %int8_1394, %int128_1395 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1700 = torch.aten.view %1694, %1699 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1700, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
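    // The RoPE frequency/angle table is rebuilt once more (same constants: 131072 positions, 128 channels,
    // base 500000) for this block's query path.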
%int131072_1396 = torch.constant.int 131072
%none_1397 = torch.constant.none
%none_1398 = torch.constant.none
%cpu_1399 = torch.constant.device "cpu"
%false_1400 = torch.constant.bool false
%1701 = torch.aten.arange %int131072_1396, %none_1397, %none_1398, %cpu_1399, %false_1400 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1401 = torch.constant.int 0
%int128_1402 = torch.constant.int 128
%none_1403 = torch.constant.none
%none_1404 = torch.constant.none
%cpu_1405 = torch.constant.device "cpu"
%false_1406 = torch.constant.bool false
%1702 = torch.aten.arange.start %int0_1401, %int128_1402, %none_1403, %none_1404, %cpu_1405, %false_1406 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1407 = torch.constant.int 2
%1703 = torch.aten.floor_divide.Scalar %1702, %int2_1407 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1408 = torch.constant.int 6
%1704 = torch.prims.convert_element_type %1703, %int6_1408 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1409 = torch.constant.int 128
%1705 = torch.aten.div.Scalar %1704, %int128_1409 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1410 = torch.constant.float 2.000000e+00
%1706 = torch.aten.mul.Scalar %1705, %float2.000000e00_1410 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1411 = torch.constant.float 5.000000e+05
%1707 = torch.aten.pow.Scalar %float5.000000e05_1411, %1706 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1708 = torch.aten.reciprocal %1707 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1412 = torch.constant.float 1.000000e+00
%1709 = torch.aten.mul.Scalar %1708, %float1.000000e00_1412 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1413 = torch.constant.int 131072
%int1_1414 = torch.constant.int 1
%1710 = torch.prim.ListConstruct %int131072_1413, %int1_1414 : (!torch.int, !torch.int) -> !torch.list<int>
%1711 = torch.aten.view %1701, %1710 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1712 = torch.aten.mul.Tensor %1711, %1709 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1415 = torch.constant.int 1
%1713 = torch.aten.size.int %1673, %int1_1415 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1416 = torch.constant.int 0
%1714 = torch.aten.add.int %int0_1416, %1713 : !torch.int, !torch.int -> !torch.int
%int0_1417 = torch.constant.int 0
%int0_1418 = torch.constant.int 0
%int1_1419 = torch.constant.int 1
%1715 = torch.aten.slice.Tensor %1712, %int0_1417, %int0_1418, %1714, %int1_1419 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1715, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1420 = torch.constant.int 1
%int0_1421 = torch.constant.int 0
%int9223372036854775807_1422 = torch.constant.int 9223372036854775807
%int1_1423 = torch.constant.int 1
%1716 = torch.aten.slice.Tensor %1715, %int1_1420, %int0_1421, %int9223372036854775807_1422, %int1_1423 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1716, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1424 = torch.constant.int 1
%int0_1425 = torch.constant.int 0
%int9223372036854775807_1426 = torch.constant.int 9223372036854775807
%int1_1427 = torch.constant.int 1
%1717 = torch.aten.slice.Tensor %1716, %int1_1424, %int0_1425, %int9223372036854775807_1426, %int1_1427 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1717, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1428 = torch.constant.int 0
%1718 = torch.aten.unsqueeze %1717, %int0_1428 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1718, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1429 = torch.constant.int 1
%int0_1430 = torch.constant.int 0
%int9223372036854775807_1431 = torch.constant.int 9223372036854775807
%int1_1432 = torch.constant.int 1
%1719 = torch.aten.slice.Tensor %1718, %int1_1429, %int0_1430, %int9223372036854775807_1431, %int1_1432 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1719, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1433 = torch.constant.int 2
%int0_1434 = torch.constant.int 0
%int9223372036854775807_1435 = torch.constant.int 9223372036854775807
%int1_1436 = torch.constant.int 1
%1720 = torch.aten.slice.Tensor %1719, %int2_1433, %int0_1434, %int9223372036854775807_1435, %int1_1436 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1720, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1437 = torch.constant.int 1
%int1_1438 = torch.constant.int 1
%int1_1439 = torch.constant.int 1
%1721 = torch.prim.ListConstruct %int1_1437, %int1_1438, %int1_1439 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1722 = torch.aten.repeat %1720, %1721 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1722, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
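    // Apply rotary position embedding to the 32 query heads: upcast to f32, call
    // sharktank_rotary_embedding_1_D_32_128_f32, and cast back to bf16 (%1728).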
%int6_1440 = torch.constant.int 6
%1723 = torch.prims.convert_element_type %1696, %int6_1440 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1723, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1724 = torch_c.to_builtin_tensor %1723 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1725 = torch_c.to_builtin_tensor %1722 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1726 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1724, %1725) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1727 = torch_c.from_builtin_tensor %1726 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1727, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1441 = torch.constant.int 15
%1728 = torch.prims.convert_element_type %1727, %int15_1441 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1728, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
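    // The same angle table is rebuilt and sliced for the 8 key heads, then the 8-head rotary kernel
    // sharktank_rotary_embedding_1_D_8_128_f32 is applied (%1756).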
%int131072_1442 = torch.constant.int 131072
%none_1443 = torch.constant.none
%none_1444 = torch.constant.none
%cpu_1445 = torch.constant.device "cpu"
%false_1446 = torch.constant.bool false
%1729 = torch.aten.arange %int131072_1442, %none_1443, %none_1444, %cpu_1445, %false_1446 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1447 = torch.constant.int 0
%int128_1448 = torch.constant.int 128
%none_1449 = torch.constant.none
%none_1450 = torch.constant.none
%cpu_1451 = torch.constant.device "cpu"
%false_1452 = torch.constant.bool false
%1730 = torch.aten.arange.start %int0_1447, %int128_1448, %none_1449, %none_1450, %cpu_1451, %false_1452 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1453 = torch.constant.int 2
%1731 = torch.aten.floor_divide.Scalar %1730, %int2_1453 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1454 = torch.constant.int 6
%1732 = torch.prims.convert_element_type %1731, %int6_1454 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1455 = torch.constant.int 128
%1733 = torch.aten.div.Scalar %1732, %int128_1455 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1456 = torch.constant.float 2.000000e+00
%1734 = torch.aten.mul.Scalar %1733, %float2.000000e00_1456 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1457 = torch.constant.float 5.000000e+05
%1735 = torch.aten.pow.Scalar %float5.000000e05_1457, %1734 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1736 = torch.aten.reciprocal %1735 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1458 = torch.constant.float 1.000000e+00
%1737 = torch.aten.mul.Scalar %1736, %float1.000000e00_1458 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1459 = torch.constant.int 131072
%int1_1460 = torch.constant.int 1
%1738 = torch.prim.ListConstruct %int131072_1459, %int1_1460 : (!torch.int, !torch.int) -> !torch.list<int>
%1739 = torch.aten.view %1729, %1738 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1740 = torch.aten.mul.Tensor %1739, %1737 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1461 = torch.constant.int 1
%1741 = torch.aten.size.int %1683, %int1_1461 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1462 = torch.constant.int 0
%1742 = torch.aten.add.int %int0_1462, %1741 : !torch.int, !torch.int -> !torch.int
%int0_1463 = torch.constant.int 0
%int0_1464 = torch.constant.int 0
%int1_1465 = torch.constant.int 1
%1743 = torch.aten.slice.Tensor %1740, %int0_1463, %int0_1464, %1742, %int1_1465 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1743, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1466 = torch.constant.int 1
%int0_1467 = torch.constant.int 0
%int9223372036854775807_1468 = torch.constant.int 9223372036854775807
%int1_1469 = torch.constant.int 1
%1744 = torch.aten.slice.Tensor %1743, %int1_1466, %int0_1467, %int9223372036854775807_1468, %int1_1469 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1744, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1470 = torch.constant.int 1
%int0_1471 = torch.constant.int 0
%int9223372036854775807_1472 = torch.constant.int 9223372036854775807
%int1_1473 = torch.constant.int 1
%1745 = torch.aten.slice.Tensor %1744, %int1_1470, %int0_1471, %int9223372036854775807_1472, %int1_1473 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1745, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1474 = torch.constant.int 0
%1746 = torch.aten.unsqueeze %1745, %int0_1474 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1746, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1475 = torch.constant.int 1
%int0_1476 = torch.constant.int 0
%int9223372036854775807_1477 = torch.constant.int 9223372036854775807
%int1_1478 = torch.constant.int 1
%1747 = torch.aten.slice.Tensor %1746, %int1_1475, %int0_1476, %int9223372036854775807_1477, %int1_1478 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1747, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1479 = torch.constant.int 2
%int0_1480 = torch.constant.int 0
%int9223372036854775807_1481 = torch.constant.int 9223372036854775807
%int1_1482 = torch.constant.int 1
%1748 = torch.aten.slice.Tensor %1747, %int2_1479, %int0_1480, %int9223372036854775807_1481, %int1_1482 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1748, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1483 = torch.constant.int 1
%int1_1484 = torch.constant.int 1
%int1_1485 = torch.constant.int 1
%1749 = torch.prim.ListConstruct %int1_1483, %int1_1484, %int1_1485 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1750 = torch.aten.repeat %1748, %1749 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1750, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1486 = torch.constant.int 6
%1751 = torch.prims.convert_element_type %1698, %int6_1486 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1751, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1752 = torch_c.to_builtin_tensor %1751 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1753 = torch_c.to_builtin_tensor %1750 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1754 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1752, %1753) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1755 = torch_c.from_builtin_tensor %1754 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1755, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_1487 = torch.constant.int 15
%1756 = torch.prims.convert_element_type %1755, %int15_1487 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1756, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
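    // Quantize the rotated keys and the values for the KV cache: divide by scale %93,
    // clamp to the f8E4M3FNUZ range [-240, 240], and cast to f8E4M3FNUZ (%1759, %1762).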
%1757 = torch.aten.div.Tensor %1756, %93 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1757, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1488 = torch.constant.float -2.400000e+02
%float2.400000e02_1489 = torch.constant.float 2.400000e+02
%1758 = torch.aten.clamp %1757, %float-2.400000e02_1488, %float2.400000e02_1489 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1758, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1490 = torch.constant.int 26
%1759 = torch.prims.convert_element_type %1758, %int26_1490 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1759, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1760 = torch.aten.div.Tensor %1700, %93 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1760, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1491 = torch.constant.float -2.400000e+02
%float2.400000e02_1492 = torch.constant.float 2.400000e+02
%1761 = torch.aten.clamp %1760, %float-2.400000e02_1491, %float2.400000e02_1492 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1761, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1493 = torch.constant.int 26
%1762 = torch.prims.convert_element_type %1761, %int26_1493 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1762, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
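    // Compute flat cache-slot indices from the page table %arg2: page_id * 64 + 10
    // (64 slots per page; the constant offset apparently selects this block's key entries).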
%int64_1494 = torch.constant.int 64
%1763 = torch.aten.mul.Scalar %arg2, %int64_1494 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1763, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int10 = torch.constant.int 10
%int1_1495 = torch.constant.int 1
%1764 = torch.aten.add.Scalar %1763, %int10, %int1_1495 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1764, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_1496 = torch.constant.int 1
%int32_1497 = torch.constant.int 32
%int8_1498 = torch.constant.int 8
%int128_1499 = torch.constant.int 128
%1765 = torch.prim.ListConstruct %int1_1496, %670, %int32_1497, %int8_1498, %int128_1499 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1766 = torch.aten.view %1759, %1765 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1766, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1500 = torch.constant.int 32
%int8_1501 = torch.constant.int 8
%int128_1502 = torch.constant.int 128
%1767 = torch.prim.ListConstruct %670, %int32_1500, %int8_1501, %int128_1502 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1768 = torch.aten.view %1766, %1767 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1768, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1769 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1770 = torch.aten.view %1764, %1769 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1770, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
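    // Scatter the quantized keys into the paged KV cache: view %1579 as [pages, 32, 2, 32, 8, 128],
    // flatten to [pages*64, 32, 8, 128], and index_put at the slot indices %1770.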
%int32_1503 = torch.constant.int 32
%int2_1504 = torch.constant.int 2
%int32_1505 = torch.constant.int 32
%int8_1506 = torch.constant.int 8
%int128_1507 = torch.constant.int 128
%1771 = torch.prim.ListConstruct %661, %int32_1503, %int2_1504, %int32_1505, %int8_1506, %int128_1507 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1772 = torch.aten.view %1579, %1771 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1772, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1508 = torch.constant.int 32
%1773 = torch.aten.mul.int %661, %int32_1508 : !torch.int, !torch.int -> !torch.int
%int2_1509 = torch.constant.int 2
%1774 = torch.aten.mul.int %1773, %int2_1509 : !torch.int, !torch.int -> !torch.int
%int32_1510 = torch.constant.int 32
%int8_1511 = torch.constant.int 8
%int128_1512 = torch.constant.int 128
%1775 = torch.prim.ListConstruct %1774, %int32_1510, %int8_1511, %int128_1512 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1776 = torch.aten.view %1772, %1775 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1776, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1777 = torch.prim.ListConstruct %1770 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1513 = torch.constant.bool false
%1778 = torch.aten.index_put %1776, %1777, %1768, %false_1513 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1778, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1514 = torch.constant.int 32
%int2_1515 = torch.constant.int 2
%int32_1516 = torch.constant.int 32
%int8_1517 = torch.constant.int 8
%int128_1518 = torch.constant.int 128
%1779 = torch.prim.ListConstruct %661, %int32_1514, %int2_1515, %int32_1516, %int8_1517, %int128_1518 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1780 = torch.aten.view %1778, %1779 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1780, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1519 = torch.constant.int 2097152
%1781 = torch.prim.ListConstruct %661, %int2097152_1519 : (!torch.int, !torch.int) -> !torch.list<int>
%1782 = torch.aten.view %1780, %1781 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1782, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
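    // Values are written the same way at the adjacent slot (indices + 1): re-view the updated cache,
    // index_put the quantized values (%1795), and flatten back to [pages, 2097152] (%1799).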
%int32_1520 = torch.constant.int 32
%int2_1521 = torch.constant.int 2
%int32_1522 = torch.constant.int 32
%int8_1523 = torch.constant.int 8
%int128_1524 = torch.constant.int 128
%1783 = torch.prim.ListConstruct %661, %int32_1520, %int2_1521, %int32_1522, %int8_1523, %int128_1524 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1784 = torch.aten.view %1782, %1783 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1784, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1525 = torch.constant.int 32
%int8_1526 = torch.constant.int 8
%int128_1527 = torch.constant.int 128
%1785 = torch.prim.ListConstruct %1774, %int32_1525, %int8_1526, %int128_1527 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1786 = torch.aten.view %1784, %1785 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1786, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_1528 = torch.constant.int 1
%int32_1529 = torch.constant.int 32
%int8_1530 = torch.constant.int 8
%int128_1531 = torch.constant.int 128
%1787 = torch.prim.ListConstruct %int1_1528, %670, %int32_1529, %int8_1530, %int128_1531 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1788 = torch.aten.view %1762, %1787 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1788, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1532 = torch.constant.int 32
%int8_1533 = torch.constant.int 8
%int128_1534 = torch.constant.int 128
%1789 = torch.prim.ListConstruct %670, %int32_1532, %int8_1533, %int128_1534 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1790 = torch.aten.view %1788, %1789 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1790, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_1535 = torch.constant.int 1
%int1_1536 = torch.constant.int 1
%1791 = torch.aten.add.Scalar %1764, %int1_1535, %int1_1536 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1791, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%1792 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1793 = torch.aten.view %1791, %1792 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1793, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%1794 = torch.prim.ListConstruct %1793 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1537 = torch.constant.bool false
%1795 = torch.aten.index_put %1786, %1794, %1790, %false_1537 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1795, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1538 = torch.constant.int 32
%int2_1539 = torch.constant.int 2
%int32_1540 = torch.constant.int 32
%int8_1541 = torch.constant.int 8
%int128_1542 = torch.constant.int 128
%1796 = torch.prim.ListConstruct %661, %int32_1538, %int2_1539, %int32_1540, %int8_1541, %int128_1542 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1797 = torch.aten.view %1795, %1796 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1797, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1543 = torch.constant.int 2097152
%1798 = torch.prim.ListConstruct %661, %int2097152_1543 : (!torch.int, !torch.int) -> !torch.list<int>
%1799 = torch.aten.view %1797, %1798 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %1799, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
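    // Grouped-query attention: broadcast the 8 KV heads to the 32 query heads by unsqueezing to
    // [1, seq, 8, 1, 128], expanding to [1, seq, 8, 4, 128], and flattening to [1, seq, 32, 128].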
%int-2_1544 = torch.constant.int -2
%1800 = torch.aten.unsqueeze %1759, %int-2_1544 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1800, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1545 = torch.constant.int 1
%int8_1546 = torch.constant.int 8
%int4_1547 = torch.constant.int 4
%int128_1548 = torch.constant.int 128
%1801 = torch.prim.ListConstruct %int1_1545, %1741, %int8_1546, %int4_1547, %int128_1548 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1549 = torch.constant.bool false
%1802 = torch.aten.expand %1800, %1801, %false_1549 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1802, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1550 = torch.constant.int 0
%1803 = torch.aten.clone %1802, %int0_1550 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1803, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1551 = torch.constant.int 1
%int32_1552 = torch.constant.int 32
%int128_1553 = torch.constant.int 128
%1804 = torch.prim.ListConstruct %int1_1551, %1741, %int32_1552, %int128_1553 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1805 = torch.aten._unsafe_view %1803, %1804 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1805, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1554 = torch.constant.int -2
%1806 = torch.aten.unsqueeze %1762, %int-2_1554 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1806, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1555 = torch.constant.int 1
%1807 = torch.aten.size.int %1693, %int1_1555 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1556 = torch.constant.int 1
%int8_1557 = torch.constant.int 8
%int4_1558 = torch.constant.int 4
%int128_1559 = torch.constant.int 128
%1808 = torch.prim.ListConstruct %int1_1556, %1807, %int8_1557, %int4_1558, %int128_1559 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1560 = torch.constant.bool false
%1809 = torch.aten.expand %1806, %1808, %false_1560 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1809, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1561 = torch.constant.int 0
%1810 = torch.aten.clone %1809, %int0_1561 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1810, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1562 = torch.constant.int 1
%int32_1563 = torch.constant.int 32
%int128_1564 = torch.constant.int 128
%1811 = torch.prim.ListConstruct %int1_1562, %1807, %int32_1563, %int128_1564 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1812 = torch.aten._unsafe_view %1810, %1811 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1812, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
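    // Dequantize the expanded keys and values: cast f8E4M3FNUZ to f32, multiply by scale %93,
    // and cast to bf16 for attention (%1815, %1818).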
%int6_1565 = torch.constant.int 6
%1813 = torch.prims.convert_element_type %1805, %int6_1565 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1813, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1814 = torch.aten.mul.Tensor %1813, %93 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1814, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1566 = torch.constant.int 15
%1815 = torch.prims.convert_element_type %1814, %int15_1566 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1815, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1567 = torch.constant.int 6
%1816 = torch.prims.convert_element_type %1812, %int6_1567 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1816, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1817 = torch.aten.mul.Tensor %1816, %93 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1817, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1568 = torch.constant.int 15
%1818 = torch.prims.convert_element_type %1817, %int15_1568 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1818, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
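    // Transpose Q/K/V to [1, 32, seq, 128] and run scaled-dot-product flash attention
    // (dropout 0.0, is_causal = true).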
%int1_1569 = torch.constant.int 1
%int2_1570 = torch.constant.int 2
%1819 = torch.aten.transpose.int %1728, %int1_1569, %int2_1570 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1819, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1571 = torch.constant.int 1
%int2_1572 = torch.constant.int 2
%1820 = torch.aten.transpose.int %1815, %int1_1571, %int2_1572 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1820, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1573 = torch.constant.int 1
%int2_1574 = torch.constant.int 2
%1821 = torch.aten.transpose.int %1818, %int1_1573, %int2_1574 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %1821, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1575 = torch.constant.float 0.000000e+00
%true_1576 = torch.constant.bool true
%none_1577 = torch.constant.none
%none_1578 = torch.constant.none
%1822:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1819, %1820, %1821, %float0.000000e00_1575, %true_1576, %none_1577, %none_1578) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %1822#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
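    // Merge the attention heads back to [1, seq, 4096] and quantize for the output projection
    // (divide by %94, clamp to [-240, 240], cast to f8E4M3FNUZ).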
%int1_1579 = torch.constant.int 1
%int2_1580 = torch.constant.int 2
%1823 = torch.aten.transpose.int %1822#0, %int1_1579, %int2_1580 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1823, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1581 = torch.constant.int 1
%int4096_1582 = torch.constant.int 4096
%1824 = torch.prim.ListConstruct %int1_1581, %1713, %int4096_1582 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1825 = torch.aten.view %1823, %1824 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1825, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1826 = torch.aten.div.Tensor %1825, %94 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1826, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1583 = torch.constant.float -2.400000e+02
%float2.400000e02_1584 = torch.constant.float 2.400000e+02
%1827 = torch.aten.clamp %1826, %float-2.400000e02_1583, %float2.400000e02_1584 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1827, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1585 = torch.constant.int 26
%1828 = torch.prims.convert_element_type %1827, %int26_1585 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1828, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
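    // Attention output projection: f8 matmul with the transposed [4096, 4096] weight %95,
    // dequantize to bf16, and add the residual %1657 (%1836).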
%int-2_1586 = torch.constant.int -2
%int-1_1587 = torch.constant.int -1
%1829 = torch.aten.transpose.int %95, %int-2_1586, %int-1_1587 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1588 = torch.constant.int 4096
%1830 = torch.prim.ListConstruct %1713, %int4096_1588 : (!torch.int, !torch.int) -> !torch.list<int>
%1831 = torch.aten.view %1828, %1830 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1831, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1832 = torch.aten.mm %1831, %1829 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1832, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1589 = torch.constant.int 1
%int4096_1590 = torch.constant.int 4096
%1833 = torch.prim.ListConstruct %int1_1589, %1713, %int4096_1590 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1834 = torch.aten.view %1832, %1833 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1834, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1591 = torch.constant.int 15
%1835 = torch.prims.convert_element_type %1834, %int15_1591 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1835, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1592 = torch.constant.int 1
%1836 = torch.aten.add.Tensor %1657, %1835, %int1_1592 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1836, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
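    // RMSNorm before the FFN: mean of squares over the last dim, add eps 1e-5, rsqrt, scale by the
    // [4096] weight %96; then quantize the normalized activations (divide by %97, clamp, cast to f8E4M3FNUZ).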
%int2_1593 = torch.constant.int 2
%1837 = torch.aten.pow.Tensor_Scalar %1836, %int2_1593 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1837, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1594 = torch.constant.int -1
%1838 = torch.prim.ListConstruct %int-1_1594 : (!torch.int) -> !torch.list<int>
%true_1595 = torch.constant.bool true
%none_1596 = torch.constant.none
%1839 = torch.aten.mean.dim %1837, %1838, %true_1595, %none_1596 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1839, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1597 = torch.constant.float 1.000000e-05
%int1_1598 = torch.constant.int 1
%1840 = torch.aten.add.Scalar %1839, %float1.000000e-05_1597, %int1_1598 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1840, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1841 = torch.aten.rsqrt %1840 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1841, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1842 = torch.aten.mul.Tensor %1836, %1841 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1842, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1843 = torch.aten.mul.Tensor %96, %1842 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1843, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1844 = torch.aten.div.Tensor %1843, %97 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1844, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1599 = torch.constant.float -2.400000e+02
%float2.400000e02_1600 = torch.constant.float 2.400000e+02
%1845 = torch.aten.clamp %1844, %float-2.400000e02_1599, %float2.400000e02_1600 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1845, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1601 = torch.constant.int 26
%1846 = torch.prims.convert_element_type %1845, %int26_1601 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1846, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
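    // FFN gate projection: matmul with the transposed [14336, 4096] weight %98, dequantize to bf16,
    // and apply SiLU (%1854).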
%int-2_1602 = torch.constant.int -2
%int-1_1603 = torch.constant.int -1
%1847 = torch.aten.transpose.int %98, %int-2_1602, %int-1_1603 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1604 = torch.constant.int 4096
%1848 = torch.prim.ListConstruct %566, %int4096_1604 : (!torch.int, !torch.int) -> !torch.list<int>
%1849 = torch.aten.view %1846, %1848 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1849, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1850 = torch.aten.mm %1849, %1847 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1850, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1605 = torch.constant.int 1
%int14336_1606 = torch.constant.int 14336
%1851 = torch.prim.ListConstruct %int1_1605, %566, %int14336_1606 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1852 = torch.aten.view %1850, %1851 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1852, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1607 = torch.constant.int 15
%1853 = torch.prims.convert_element_type %1852, %int15_1607 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1853, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1854 = torch.aten.silu %1853 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1854, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
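    // FFN up projection with weight %100, dequantized to bf16 and multiplied elementwise with the
    // SiLU-gated branch (SwiGLU, %1865).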
%1855 = torch.aten.div.Tensor %1843, %99 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1855, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1608 = torch.constant.float -2.400000e+02
%float2.400000e02_1609 = torch.constant.float 2.400000e+02
%1856 = torch.aten.clamp %1855, %float-2.400000e02_1608, %float2.400000e02_1609 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1856, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1610 = torch.constant.int 26
%1857 = torch.prims.convert_element_type %1856, %int26_1610 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1857, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1611 = torch.constant.int -2
%int-1_1612 = torch.constant.int -1
%1858 = torch.aten.transpose.int %100, %int-2_1611, %int-1_1612 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1613 = torch.constant.int 4096
%1859 = torch.prim.ListConstruct %566, %int4096_1613 : (!torch.int, !torch.int) -> !torch.list<int>
%1860 = torch.aten.view %1857, %1859 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1860, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1861 = torch.aten.mm %1860, %1858 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1861, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1614 = torch.constant.int 1
%int14336_1615 = torch.constant.int 14336
%1862 = torch.prim.ListConstruct %int1_1614, %566, %int14336_1615 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1863 = torch.aten.view %1861, %1862 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1863, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1616 = torch.constant.int 15
%1864 = torch.prims.convert_element_type %1863, %int15_1616 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1864, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%1865 = torch.aten.mul.Tensor %1854, %1864 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1865, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
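    // FFN down projection: quantize the product (scale %101), matmul with the transposed
    // [4096, 14336] weight %102, dequantize to bf16, and add the residual (%1877).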
%1866 = torch.aten.div.Tensor %1865, %101 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1866, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1617 = torch.constant.float -2.400000e+02
%float2.400000e02_1618 = torch.constant.float 2.400000e+02
%1867 = torch.aten.clamp %1866, %float-2.400000e02_1617, %float2.400000e02_1618 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %1867, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1619 = torch.constant.int 26
%1868 = torch.prims.convert_element_type %1867, %int26_1619 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1868, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1620 = torch.constant.int -2
%int-1_1621 = torch.constant.int -1
%1869 = torch.aten.transpose.int %102, %int-2_1620, %int-1_1621 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1622 = torch.constant.int 1
%1870 = torch.aten.size.int %1852, %int1_1622 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1623 = torch.constant.int 14336
%1871 = torch.prim.ListConstruct %1870, %int14336_1623 : (!torch.int, !torch.int) -> !torch.list<int>
%1872 = torch.aten.view %1868, %1871 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %1872, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%1873 = torch.aten.mm %1872, %1869 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1873, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1624 = torch.constant.int 1
%int4096_1625 = torch.constant.int 4096
%1874 = torch.prim.ListConstruct %int1_1624, %1870, %int4096_1625 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1875 = torch.aten.view %1873, %1874 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1875, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1626 = torch.constant.int 15
%1876 = torch.prims.convert_element_type %1875, %int15_1626 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1876, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1627 = torch.constant.int 1
%1877 = torch.aten.add.Tensor %1836, %1876, %int1_1627 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1877, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
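    // Start of the next decoder block: attention RMSNorm with weight %103, followed by quantization
    // of the normalized input for the Q/K/V projections.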
%int2_1628 = torch.constant.int 2
%1878 = torch.aten.pow.Tensor_Scalar %1877, %int2_1628 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1878, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1629 = torch.constant.int -1
%1879 = torch.prim.ListConstruct %int-1_1629 : (!torch.int) -> !torch.list<int>
%true_1630 = torch.constant.bool true
%none_1631 = torch.constant.none
%1880 = torch.aten.mean.dim %1878, %1879, %true_1630, %none_1631 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1880, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1632 = torch.constant.float 1.000000e-05
%int1_1633 = torch.constant.int 1
%1881 = torch.aten.add.Scalar %1880, %float1.000000e-05_1632, %int1_1633 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1881, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1882 = torch.aten.rsqrt %1881 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %1882, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%1883 = torch.aten.mul.Tensor %1877, %1882 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1883, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1884 = torch.aten.mul.Tensor %103, %1883 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1884, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%1885 = torch.aten.div.Tensor %1884, %104 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1885, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1634 = torch.constant.float -2.400000e+02
%float2.400000e02_1635 = torch.constant.float 2.400000e+02
%1886 = torch.aten.clamp %1885, %float-2.400000e02_1634, %float2.400000e02_1635 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1886, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1636 = torch.constant.int 26
%1887 = torch.prims.convert_element_type %1886, %int26_1636 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1887, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
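    // Q/K/V projections for this block: Q with the [4096, 4096] weight %105, K with the [1024, 4096]
    // weight %107, V with the [1024, 4096] weight %109, each as an f8 matmul followed by a dequantize to bf16.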
%int-2_1637 = torch.constant.int -2
%int-1_1638 = torch.constant.int -1
%1888 = torch.aten.transpose.int %105, %int-2_1637, %int-1_1638 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1639 = torch.constant.int 4096
%1889 = torch.prim.ListConstruct %566, %int4096_1639 : (!torch.int, !torch.int) -> !torch.list<int>
%1890 = torch.aten.view %1887, %1889 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1890, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1891 = torch.aten.mm %1890, %1888 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1891, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1640 = torch.constant.int 1
%int4096_1641 = torch.constant.int 4096
%1892 = torch.prim.ListConstruct %int1_1640, %566, %int4096_1641 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1893 = torch.aten.view %1891, %1892 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1893, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1642 = torch.constant.int 15
%1894 = torch.prims.convert_element_type %1893, %int15_1642 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %1894, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%1895 = torch.aten.div.Tensor %1884, %106 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1895, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1643 = torch.constant.float -2.400000e+02
%float2.400000e02_1644 = torch.constant.float 2.400000e+02
%1896 = torch.aten.clamp %1895, %float-2.400000e02_1643, %float2.400000e02_1644 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1896, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1645 = torch.constant.int 26
%1897 = torch.prims.convert_element_type %1896, %int26_1645 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1897, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1646 = torch.constant.int -2
%int-1_1647 = torch.constant.int -1
%1898 = torch.aten.transpose.int %107, %int-2_1646, %int-1_1647 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1648 = torch.constant.int 4096
%1899 = torch.prim.ListConstruct %566, %int4096_1648 : (!torch.int, !torch.int) -> !torch.list<int>
%1900 = torch.aten.view %1897, %1899 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1900, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1901 = torch.aten.mm %1900, %1898 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1901, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1649 = torch.constant.int 1
%int1024_1650 = torch.constant.int 1024
%1902 = torch.prim.ListConstruct %int1_1649, %566, %int1024_1650 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1903 = torch.aten.view %1901, %1902 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1903, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1651 = torch.constant.int 15
%1904 = torch.prims.convert_element_type %1903, %int15_1651 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1904, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%1905 = torch.aten.div.Tensor %1884, %108 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1905, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1652 = torch.constant.float -2.400000e+02
%float2.400000e02_1653 = torch.constant.float 2.400000e+02
%1906 = torch.aten.clamp %1905, %float-2.400000e02_1652, %float2.400000e02_1653 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %1906, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1654 = torch.constant.int 26
%1907 = torch.prims.convert_element_type %1906, %int26_1654 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1907, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1655 = torch.constant.int -2
%int-1_1656 = torch.constant.int -1
%1908 = torch.aten.transpose.int %109, %int-2_1655, %int-1_1656 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1657 = torch.constant.int 4096
%1909 = torch.prim.ListConstruct %566, %int4096_1657 : (!torch.int, !torch.int) -> !torch.list<int>
%1910 = torch.aten.view %1907, %1909 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %1910, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%1911 = torch.aten.mm %1910, %1908 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1911, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1658 = torch.constant.int 1
%int1024_1659 = torch.constant.int 1024
%1912 = torch.prim.ListConstruct %int1_1658, %566, %int1024_1659 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1913 = torch.aten.view %1911, %1912 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %1913, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1660 = torch.constant.int 15
%1914 = torch.prims.convert_element_type %1913, %int15_1660 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %1914, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
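    // Reshape the projections into attention heads: queries to [1, seq, 32, 128],
    // keys and values to [1, seq, 8, 128].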
%int1_1661 = torch.constant.int 1
%int32_1662 = torch.constant.int 32
%int128_1663 = torch.constant.int 128
%1915 = torch.prim.ListConstruct %int1_1661, %566, %int32_1662, %int128_1663 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1916 = torch.aten.view %1894, %1915 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1916, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1664 = torch.constant.int 1
%int8_1665 = torch.constant.int 8
%int128_1666 = torch.constant.int 128
%1917 = torch.prim.ListConstruct %int1_1664, %566, %int8_1665, %int128_1666 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1918 = torch.aten.view %1904, %1917 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1918, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1667 = torch.constant.int 1
%int8_1668 = torch.constant.int 8
%int128_1669 = torch.constant.int 128
%1919 = torch.prim.ListConstruct %int1_1667, %566, %int8_1668, %int128_1669 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1920 = torch.aten.view %1914, %1919 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1920, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
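    // Rebuild the rotary-embedding angle table for this block's queries
    // (same arange / inverse-frequency computation as above).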
%int131072_1670 = torch.constant.int 131072
%none_1671 = torch.constant.none
%none_1672 = torch.constant.none
%cpu_1673 = torch.constant.device "cpu"
%false_1674 = torch.constant.bool false
%1921 = torch.aten.arange %int131072_1670, %none_1671, %none_1672, %cpu_1673, %false_1674 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1675 = torch.constant.int 0
%int128_1676 = torch.constant.int 128
%none_1677 = torch.constant.none
%none_1678 = torch.constant.none
%cpu_1679 = torch.constant.device "cpu"
%false_1680 = torch.constant.bool false
%1922 = torch.aten.arange.start %int0_1675, %int128_1676, %none_1677, %none_1678, %cpu_1679, %false_1680 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1681 = torch.constant.int 2
%1923 = torch.aten.floor_divide.Scalar %1922, %int2_1681 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1682 = torch.constant.int 6
%1924 = torch.prims.convert_element_type %1923, %int6_1682 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1683 = torch.constant.int 128
%1925 = torch.aten.div.Scalar %1924, %int128_1683 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1684 = torch.constant.float 2.000000e+00
%1926 = torch.aten.mul.Scalar %1925, %float2.000000e00_1684 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1685 = torch.constant.float 5.000000e+05
%1927 = torch.aten.pow.Scalar %float5.000000e05_1685, %1926 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1928 = torch.aten.reciprocal %1927 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1686 = torch.constant.float 1.000000e+00
%1929 = torch.aten.mul.Scalar %1928, %float1.000000e00_1686 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1687 = torch.constant.int 131072
%int1_1688 = torch.constant.int 1
%1930 = torch.prim.ListConstruct %int131072_1687, %int1_1688 : (!torch.int, !torch.int) -> !torch.list<int>
%1931 = torch.aten.view %1921, %1930 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1932 = torch.aten.mul.Tensor %1931, %1929 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1689 = torch.constant.int 1
%1933 = torch.aten.size.int %1893, %int1_1689 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1690 = torch.constant.int 0
%1934 = torch.aten.add.int %int0_1690, %1933 : !torch.int, !torch.int -> !torch.int
%int0_1691 = torch.constant.int 0
%int0_1692 = torch.constant.int 0
%int1_1693 = torch.constant.int 1
%1935 = torch.aten.slice.Tensor %1932, %int0_1691, %int0_1692, %1934, %int1_1693 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1935, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1694 = torch.constant.int 1
%int0_1695 = torch.constant.int 0
%int9223372036854775807_1696 = torch.constant.int 9223372036854775807
%int1_1697 = torch.constant.int 1
%1936 = torch.aten.slice.Tensor %1935, %int1_1694, %int0_1695, %int9223372036854775807_1696, %int1_1697 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1936, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1698 = torch.constant.int 1
%int0_1699 = torch.constant.int 0
%int9223372036854775807_1700 = torch.constant.int 9223372036854775807
%int1_1701 = torch.constant.int 1
%1937 = torch.aten.slice.Tensor %1936, %int1_1698, %int0_1699, %int9223372036854775807_1700, %int1_1701 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1937, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1702 = torch.constant.int 0
%1938 = torch.aten.unsqueeze %1937, %int0_1702 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1938, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1703 = torch.constant.int 1
%int0_1704 = torch.constant.int 0
%int9223372036854775807_1705 = torch.constant.int 9223372036854775807
%int1_1706 = torch.constant.int 1
%1939 = torch.aten.slice.Tensor %1938, %int1_1703, %int0_1704, %int9223372036854775807_1705, %int1_1706 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1939, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1707 = torch.constant.int 2
%int0_1708 = torch.constant.int 0
%int9223372036854775807_1709 = torch.constant.int 9223372036854775807
%int1_1710 = torch.constant.int 1
%1940 = torch.aten.slice.Tensor %1939, %int2_1707, %int0_1708, %int9223372036854775807_1709, %int1_1710 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1940, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1711 = torch.constant.int 1
%int1_1712 = torch.constant.int 1
%int1_1713 = torch.constant.int 1
%1941 = torch.prim.ListConstruct %int1_1711, %int1_1712, %int1_1713 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1942 = torch.aten.repeat %1940, %1941 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1942, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1714 = torch.constant.int 6
%1943 = torch.prims.convert_element_type %1916, %int6_1714 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1943, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%1944 = torch_c.to_builtin_tensor %1943 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%1945 = torch_c.to_builtin_tensor %1942 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1946 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%1944, %1945) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%1947 = torch_c.from_builtin_tensor %1946 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %1947, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1715 = torch.constant.int 15
%1948 = torch.prims.convert_element_type %1947, %int15_1715 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %1948, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
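// %1946/%1948: RoPE applied to the 32 query heads via the sharktank rotary-embedding kernel
// (computed in f32, cast back to bf16). The same frequency-table construction is repeated below
// before the 8-head variant of the kernel is called for K.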
%int131072_1716 = torch.constant.int 131072
%none_1717 = torch.constant.none
%none_1718 = torch.constant.none
%cpu_1719 = torch.constant.device "cpu"
%false_1720 = torch.constant.bool false
%1949 = torch.aten.arange %int131072_1716, %none_1717, %none_1718, %cpu_1719, %false_1720 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1721 = torch.constant.int 0
%int128_1722 = torch.constant.int 128
%none_1723 = torch.constant.none
%none_1724 = torch.constant.none
%cpu_1725 = torch.constant.device "cpu"
%false_1726 = torch.constant.bool false
%1950 = torch.aten.arange.start %int0_1721, %int128_1722, %none_1723, %none_1724, %cpu_1725, %false_1726 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1727 = torch.constant.int 2
%1951 = torch.aten.floor_divide.Scalar %1950, %int2_1727 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1728 = torch.constant.int 6
%1952 = torch.prims.convert_element_type %1951, %int6_1728 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1729 = torch.constant.int 128
%1953 = torch.aten.div.Scalar %1952, %int128_1729 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1730 = torch.constant.float 2.000000e+00
%1954 = torch.aten.mul.Scalar %1953, %float2.000000e00_1730 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1731 = torch.constant.float 5.000000e+05
%1955 = torch.aten.pow.Scalar %float5.000000e05_1731, %1954 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%1956 = torch.aten.reciprocal %1955 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1732 = torch.constant.float 1.000000e+00
%1957 = torch.aten.mul.Scalar %1956, %float1.000000e00_1732 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1733 = torch.constant.int 131072
%int1_1734 = torch.constant.int 1
%1958 = torch.prim.ListConstruct %int131072_1733, %int1_1734 : (!torch.int, !torch.int) -> !torch.list<int>
%1959 = torch.aten.view %1949, %1958 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%1960 = torch.aten.mul.Tensor %1959, %1957 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1735 = torch.constant.int 1
%1961 = torch.aten.size.int %1903, %int1_1735 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1736 = torch.constant.int 0
%1962 = torch.aten.add.int %int0_1736, %1961 : !torch.int, !torch.int -> !torch.int
%int0_1737 = torch.constant.int 0
%int0_1738 = torch.constant.int 0
%int1_1739 = torch.constant.int 1
%1963 = torch.aten.slice.Tensor %1960, %int0_1737, %int0_1738, %1962, %int1_1739 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1963, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1740 = torch.constant.int 1
%int0_1741 = torch.constant.int 0
%int9223372036854775807_1742 = torch.constant.int 9223372036854775807
%int1_1743 = torch.constant.int 1
%1964 = torch.aten.slice.Tensor %1963, %int1_1740, %int0_1741, %int9223372036854775807_1742, %int1_1743 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1964, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1744 = torch.constant.int 1
%int0_1745 = torch.constant.int 0
%int9223372036854775807_1746 = torch.constant.int 9223372036854775807
%int1_1747 = torch.constant.int 1
%1965 = torch.aten.slice.Tensor %1964, %int1_1744, %int0_1745, %int9223372036854775807_1746, %int1_1747 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %1965, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1748 = torch.constant.int 0
%1966 = torch.aten.unsqueeze %1965, %int0_1748 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1966, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1749 = torch.constant.int 1
%int0_1750 = torch.constant.int 0
%int9223372036854775807_1751 = torch.constant.int 9223372036854775807
%int1_1752 = torch.constant.int 1
%1967 = torch.aten.slice.Tensor %1966, %int1_1749, %int0_1750, %int9223372036854775807_1751, %int1_1752 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1967, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1753 = torch.constant.int 2
%int0_1754 = torch.constant.int 0
%int9223372036854775807_1755 = torch.constant.int 9223372036854775807
%int1_1756 = torch.constant.int 1
%1968 = torch.aten.slice.Tensor %1967, %int2_1753, %int0_1754, %int9223372036854775807_1755, %int1_1756 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1968, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1757 = torch.constant.int 1
%int1_1758 = torch.constant.int 1
%int1_1759 = torch.constant.int 1
%1969 = torch.prim.ListConstruct %int1_1757, %int1_1758, %int1_1759 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1970 = torch.aten.repeat %1968, %1969 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %1970, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_1760 = torch.constant.int 6
%1971 = torch.prims.convert_element_type %1918, %int6_1760 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1971, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%1972 = torch_c.to_builtin_tensor %1971 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%1973 = torch_c.to_builtin_tensor %1970 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%1974 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%1972, %1973) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%1975 = torch_c.from_builtin_tensor %1974 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %1975, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_1761 = torch.constant.int 15
%1976 = torch.prims.convert_element_type %1975, %int15_1761 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1976, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
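// K (with RoPE applied, %1976) and V (%1920) are quantized for the KV cache: divide by what is
// presumably the kv_cache quantizer scale (%110), clamp to [-240, 240] (the f8E4M3FNUZ finite
// range), and convert to f8E4M3FNUZ.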
%1977 = torch.aten.div.Tensor %1976, %110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1977, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1762 = torch.constant.float -2.400000e+02
%float2.400000e02_1763 = torch.constant.float 2.400000e+02
%1978 = torch.aten.clamp %1977, %float-2.400000e02_1762, %float2.400000e02_1763 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1978, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1764 = torch.constant.int 26
%1979 = torch.prims.convert_element_type %1978, %int26_1764 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1979, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%1980 = torch.aten.div.Tensor %1920, %110 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1980, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_1765 = torch.constant.float -2.400000e+02
%float2.400000e02_1766 = torch.constant.float 2.400000e+02
%1981 = torch.aten.clamp %1980, %float-2.400000e02_1765, %float2.400000e02_1766 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %1981, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_1767 = torch.constant.int 26
%1982 = torch.prims.convert_element_type %1981, %int26_1767 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1982, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
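// Paged KV-cache write for K: slot indices are %arg2 * 64 + 12 (the +12 offset presumably selects
// this transformer block's K slot in the page layout), the f16 page buffer (%1799) is viewed as
// [pages, 32, 2, 32, 8, 128], flattened, and the quantized K is scattered in with index_put.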
%int64_1768 = torch.constant.int 64
%1983 = torch.aten.mul.Scalar %arg2, %int64_1768 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1983, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int12 = torch.constant.int 12
%int1_1769 = torch.constant.int 1
%1984 = torch.aten.add.Scalar %1983, %int12, %int1_1769 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %1984, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_1770 = torch.constant.int 1
%int32_1771 = torch.constant.int 32
%int8_1772 = torch.constant.int 8
%int128_1773 = torch.constant.int 128
%1985 = torch.prim.ListConstruct %int1_1770, %670, %int32_1771, %int8_1772, %int128_1773 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1986 = torch.aten.view %1979, %1985 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1986, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1774 = torch.constant.int 32
%int8_1775 = torch.constant.int 8
%int128_1776 = torch.constant.int 128
%1987 = torch.prim.ListConstruct %670, %int32_1774, %int8_1775, %int128_1776 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1988 = torch.aten.view %1986, %1987 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %1988, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%1989 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%1990 = torch.aten.view %1984, %1989 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %1990, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_1777 = torch.constant.int 32
%int2_1778 = torch.constant.int 2
%int32_1779 = torch.constant.int 32
%int8_1780 = torch.constant.int 8
%int128_1781 = torch.constant.int 128
%1991 = torch.prim.ListConstruct %661, %int32_1777, %int2_1778, %int32_1779, %int8_1780, %int128_1781 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1992 = torch.aten.view %1799, %1991 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %1992, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1782 = torch.constant.int 32
%1993 = torch.aten.mul.int %661, %int32_1782 : !torch.int, !torch.int -> !torch.int
%int2_1783 = torch.constant.int 2
%1994 = torch.aten.mul.int %1993, %int2_1783 : !torch.int, !torch.int -> !torch.int
%int32_1784 = torch.constant.int 32
%int8_1785 = torch.constant.int 8
%int128_1786 = torch.constant.int 128
%1995 = torch.prim.ListConstruct %1994, %int32_1784, %int8_1785, %int128_1786 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%1996 = torch.aten.view %1992, %1995 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1996, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%1997 = torch.prim.ListConstruct %1990 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1787 = torch.constant.bool false
%1998 = torch.aten.index_put %1996, %1997, %1988, %false_1787 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %1998, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1788 = torch.constant.int 32
%int2_1789 = torch.constant.int 2
%int32_1790 = torch.constant.int 32
%int8_1791 = torch.constant.int 8
%int128_1792 = torch.constant.int 128
%1999 = torch.prim.ListConstruct %661, %int32_1788, %int2_1789, %int32_1790, %int8_1791, %int128_1792 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2000 = torch.aten.view %1998, %1999 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2000, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1793 = torch.constant.int 2097152
%2001 = torch.prim.ListConstruct %661, %int2097152_1793 : (!torch.int, !torch.int) -> !torch.list<int>
%2002 = torch.aten.view %2000, %2001 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2002, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
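// The same scatter follows for V, using the K indices + 1; the cache is again reshaped back to
// its flat [?, 2097152] form (%2019) after the write.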
%int32_1794 = torch.constant.int 32
%int2_1795 = torch.constant.int 2
%int32_1796 = torch.constant.int 32
%int8_1797 = torch.constant.int 8
%int128_1798 = torch.constant.int 128
%2003 = torch.prim.ListConstruct %661, %int32_1794, %int2_1795, %int32_1796, %int8_1797, %int128_1798 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2004 = torch.aten.view %2002, %2003 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2004, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_1799 = torch.constant.int 32
%int8_1800 = torch.constant.int 8
%int128_1801 = torch.constant.int 128
%2005 = torch.prim.ListConstruct %1994, %int32_1799, %int8_1800, %int128_1801 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2006 = torch.aten.view %2004, %2005 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2006, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_1802 = torch.constant.int 1
%int32_1803 = torch.constant.int 32
%int8_1804 = torch.constant.int 8
%int128_1805 = torch.constant.int 128
%2007 = torch.prim.ListConstruct %int1_1802, %670, %int32_1803, %int8_1804, %int128_1805 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2008 = torch.aten.view %1982, %2007 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2008, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_1806 = torch.constant.int 32
%int8_1807 = torch.constant.int 8
%int128_1808 = torch.constant.int 128
%2009 = torch.prim.ListConstruct %670, %int32_1806, %int8_1807, %int128_1808 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2010 = torch.aten.view %2008, %2009 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2010, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_1809 = torch.constant.int 1
%int1_1810 = torch.constant.int 1
%2011 = torch.aten.add.Scalar %1984, %int1_1809, %int1_1810 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2011, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2012 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2013 = torch.aten.view %2011, %2012 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2013, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2014 = torch.prim.ListConstruct %2013 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_1811 = torch.constant.bool false
%2015 = torch.aten.index_put %2006, %2014, %2010, %false_1811 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2015, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_1812 = torch.constant.int 32
%int2_1813 = torch.constant.int 2
%int32_1814 = torch.constant.int 32
%int8_1815 = torch.constant.int 8
%int128_1816 = torch.constant.int 128
%2016 = torch.prim.ListConstruct %661, %int32_1812, %int2_1813, %int32_1814, %int8_1815, %int128_1816 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2017 = torch.aten.view %2015, %2016 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2017, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_1817 = torch.constant.int 2097152
%2018 = torch.prim.ListConstruct %661, %int2097152_1817 : (!torch.int, !torch.int) -> !torch.list<int>
%2019 = torch.aten.view %2017, %2018 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2019, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
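// GQA head expansion: the 8 KV heads of K and V are unsqueezed, expanded to 8 x 4, and reshaped
// to 32 heads to match the 32 query heads; the f8 values are then dequantized (multiply by %110),
// and cast to bf16 for attention.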
%int-2_1818 = torch.constant.int -2
%2020 = torch.aten.unsqueeze %1979, %int-2_1818 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2020, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1819 = torch.constant.int 1
%int8_1820 = torch.constant.int 8
%int4_1821 = torch.constant.int 4
%int128_1822 = torch.constant.int 128
%2021 = torch.prim.ListConstruct %int1_1819, %1961, %int8_1820, %int4_1821, %int128_1822 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1823 = torch.constant.bool false
%2022 = torch.aten.expand %2020, %2021, %false_1823 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2022, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1824 = torch.constant.int 0
%2023 = torch.aten.clone %2022, %int0_1824 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2023, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1825 = torch.constant.int 1
%int32_1826 = torch.constant.int 32
%int128_1827 = torch.constant.int 128
%2024 = torch.prim.ListConstruct %int1_1825, %1961, %int32_1826, %int128_1827 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2025 = torch.aten._unsafe_view %2023, %2024 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2025, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_1828 = torch.constant.int -2
%2026 = torch.aten.unsqueeze %1982, %int-2_1828 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2026, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_1829 = torch.constant.int 1
%2027 = torch.aten.size.int %1913, %int1_1829 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_1830 = torch.constant.int 1
%int8_1831 = torch.constant.int 8
%int4_1832 = torch.constant.int 4
%int128_1833 = torch.constant.int 128
%2028 = torch.prim.ListConstruct %int1_1830, %2027, %int8_1831, %int4_1832, %int128_1833 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_1834 = torch.constant.bool false
%2029 = torch.aten.expand %2026, %2028, %false_1834 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2029, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_1835 = torch.constant.int 0
%2030 = torch.aten.clone %2029, %int0_1835 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2030, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_1836 = torch.constant.int 1
%int32_1837 = torch.constant.int 32
%int128_1838 = torch.constant.int 128
%2031 = torch.prim.ListConstruct %int1_1836, %2027, %int32_1837, %int128_1838 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2032 = torch.aten._unsafe_view %2030, %2031 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2032, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int6_1839 = torch.constant.int 6
%2033 = torch.prims.convert_element_type %2025, %int6_1839 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2033, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2034 = torch.aten.mul.Tensor %2033, %110 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2034, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1840 = torch.constant.int 15
%2035 = torch.prims.convert_element_type %2034, %int15_1840 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2035, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_1841 = torch.constant.int 6
%2036 = torch.prims.convert_element_type %2032, %int6_1841 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2036, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2037 = torch.aten.mul.Tensor %2036, %110 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2037, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1842 = torch.constant.int 15
%2038 = torch.prims.convert_element_type %2037, %int15_1842 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2038, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
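// Q, K and V are transposed to [1, 32, seq, 128] and fed to the CPU flash-attention op with
// is_causal = true and zero dropout.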
%int1_1843 = torch.constant.int 1
%int2_1844 = torch.constant.int 2
%2039 = torch.aten.transpose.int %1948, %int1_1843, %int2_1844 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2039, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1845 = torch.constant.int 1
%int2_1846 = torch.constant.int 2
%2040 = torch.aten.transpose.int %2035, %int1_1845, %int2_1846 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2040, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_1847 = torch.constant.int 1
%int2_1848 = torch.constant.int 2
%2041 = torch.aten.transpose.int %2038, %int1_1847, %int2_1848 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2041, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_1849 = torch.constant.float 0.000000e+00
%true_1850 = torch.constant.bool true
%none_1851 = torch.constant.none
%none_1852 = torch.constant.none
%2042:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2039, %2040, %2041, %float0.000000e00_1849, %true_1850, %none_1851, %none_1852) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2042#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
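// Attention output path: transpose back to [1, seq, 32, 128], flatten to [1, seq, 4096],
// re-quantize to f8E4M3FNUZ (divide by %111, clamp to +-240), matmul with the transposed weight
// %112 (presumably attn_output), dequantize to bf16, and add the residual (%1877) to get %2056.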
%int1_1853 = torch.constant.int 1
%int2_1854 = torch.constant.int 2
%2043 = torch.aten.transpose.int %2042#0, %int1_1853, %int2_1854 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2043, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1855 = torch.constant.int 1
%int4096_1856 = torch.constant.int 4096
%2044 = torch.prim.ListConstruct %int1_1855, %1933, %int4096_1856 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2045 = torch.aten.view %2043, %2044 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2045, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2046 = torch.aten.div.Tensor %2045, %111 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2046, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_1857 = torch.constant.float -2.400000e+02
%float2.400000e02_1858 = torch.constant.float 2.400000e+02
%2047 = torch.aten.clamp %2046, %float-2.400000e02_1857, %float2.400000e02_1858 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2047, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_1859 = torch.constant.int 26
%2048 = torch.prims.convert_element_type %2047, %int26_1859 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2048, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1860 = torch.constant.int -2
%int-1_1861 = torch.constant.int -1
%2049 = torch.aten.transpose.int %112, %int-2_1860, %int-1_1861 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1862 = torch.constant.int 4096
%2050 = torch.prim.ListConstruct %1933, %int4096_1862 : (!torch.int, !torch.int) -> !torch.list<int>
%2051 = torch.aten.view %2048, %2050 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2051, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2052 = torch.aten.mm %2051, %2049 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2052, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1863 = torch.constant.int 1
%int4096_1864 = torch.constant.int 4096
%2053 = torch.prim.ListConstruct %int1_1863, %1933, %int4096_1864 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2054 = torch.aten.view %2052, %2053 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2054, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1865 = torch.constant.int 15
%2055 = torch.prims.convert_element_type %2054, %int15_1865 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2055, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1866 = torch.constant.int 1
%2056 = torch.aten.add.Tensor %1877, %2055, %int1_1866 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2056, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
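// FFN sub-layer: RMSNorm of the residual %2056 (eps 1e-05, weight %113, presumably ffn_norm),
// quantization to f8, then a SwiGLU MLP: gate projection (4096 -> 14336) with silu, up projection,
// elementwise product, and down projection (14336 -> 4096) whose bf16 result is added back to the
// residual at %2097.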
%int2_1867 = torch.constant.int 2
%2057 = torch.aten.pow.Tensor_Scalar %2056, %int2_1867 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2057, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1868 = torch.constant.int -1
%2058 = torch.prim.ListConstruct %int-1_1868 : (!torch.int) -> !torch.list<int>
%true_1869 = torch.constant.bool true
%none_1870 = torch.constant.none
%2059 = torch.aten.mean.dim %2057, %2058, %true_1869, %none_1870 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2059, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1871 = torch.constant.float 1.000000e-05
%int1_1872 = torch.constant.int 1
%2060 = torch.aten.add.Scalar %2059, %float1.000000e-05_1871, %int1_1872 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2060, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2061 = torch.aten.rsqrt %2060 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2061, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2062 = torch.aten.mul.Tensor %2056, %2061 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2062, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2063 = torch.aten.mul.Tensor %113, %2062 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2063, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2064 = torch.aten.div.Tensor %2063, %114 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2064, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1873 = torch.constant.float -2.400000e+02
%float2.400000e02_1874 = torch.constant.float 2.400000e+02
%2065 = torch.aten.clamp %2064, %float-2.400000e02_1873, %float2.400000e02_1874 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2065, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1875 = torch.constant.int 26
%2066 = torch.prims.convert_element_type %2065, %int26_1875 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2066, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1876 = torch.constant.int -2
%int-1_1877 = torch.constant.int -1
%2067 = torch.aten.transpose.int %115, %int-2_1876, %int-1_1877 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1878 = torch.constant.int 4096
%2068 = torch.prim.ListConstruct %566, %int4096_1878 : (!torch.int, !torch.int) -> !torch.list<int>
%2069 = torch.aten.view %2066, %2068 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2069, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2070 = torch.aten.mm %2069, %2067 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2070, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1879 = torch.constant.int 1
%int14336_1880 = torch.constant.int 14336
%2071 = torch.prim.ListConstruct %int1_1879, %566, %int14336_1880 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2072 = torch.aten.view %2070, %2071 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2072, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1881 = torch.constant.int 15
%2073 = torch.prims.convert_element_type %2072, %int15_1881 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2073, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2074 = torch.aten.silu %2073 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2074, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2075 = torch.aten.div.Tensor %2063, %116 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2075, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1882 = torch.constant.float -2.400000e+02
%float2.400000e02_1883 = torch.constant.float 2.400000e+02
%2076 = torch.aten.clamp %2075, %float-2.400000e02_1882, %float2.400000e02_1883 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2076, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1884 = torch.constant.int 26
%2077 = torch.prims.convert_element_type %2076, %int26_1884 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2077, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1885 = torch.constant.int -2
%int-1_1886 = torch.constant.int -1
%2078 = torch.aten.transpose.int %117, %int-2_1885, %int-1_1886 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_1887 = torch.constant.int 4096
%2079 = torch.prim.ListConstruct %566, %int4096_1887 : (!torch.int, !torch.int) -> !torch.list<int>
%2080 = torch.aten.view %2077, %2079 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2080, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2081 = torch.aten.mm %2080, %2078 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2081, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_1888 = torch.constant.int 1
%int14336_1889 = torch.constant.int 14336
%2082 = torch.prim.ListConstruct %int1_1888, %566, %int14336_1889 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2083 = torch.aten.view %2081, %2082 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2083, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_1890 = torch.constant.int 15
%2084 = torch.prims.convert_element_type %2083, %int15_1890 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2084, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2085 = torch.aten.mul.Tensor %2074, %2084 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2085, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2086 = torch.aten.div.Tensor %2085, %118 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2086, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_1891 = torch.constant.float -2.400000e+02
%float2.400000e02_1892 = torch.constant.float 2.400000e+02
%2087 = torch.aten.clamp %2086, %float-2.400000e02_1891, %float2.400000e02_1892 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2087, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_1893 = torch.constant.int 26
%2088 = torch.prims.convert_element_type %2087, %int26_1893 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2088, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_1894 = torch.constant.int -2
%int-1_1895 = torch.constant.int -1
%2089 = torch.aten.transpose.int %119, %int-2_1894, %int-1_1895 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_1896 = torch.constant.int 1
%2090 = torch.aten.size.int %2072, %int1_1896 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_1897 = torch.constant.int 14336
%2091 = torch.prim.ListConstruct %2090, %int14336_1897 : (!torch.int, !torch.int) -> !torch.list<int>
%2092 = torch.aten.view %2088, %2091 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2092, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2093 = torch.aten.mm %2092, %2089 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2093, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1898 = torch.constant.int 1
%int4096_1899 = torch.constant.int 4096
%2094 = torch.prim.ListConstruct %int1_1898, %2090, %int4096_1899 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2095 = torch.aten.view %2093, %2094 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2095, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1900 = torch.constant.int 15
%2096 = torch.prims.convert_element_type %2095, %int15_1900 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2096, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_1901 = torch.constant.int 1
%2097 = torch.aten.add.Tensor %2056, %2096, %int1_1901 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2097, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
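// What follows appears to be the start of the next transformer block: RMSNorm of %2097 with that
// block's attn_norm weight (%120), quantization, and the Q (4096x4096), K and V (1024x4096)
// projections, each as an f8 matmul followed by a cast back to bf16.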
%int2_1902 = torch.constant.int 2
%2098 = torch.aten.pow.Tensor_Scalar %2097, %int2_1902 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2098, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1903 = torch.constant.int -1
%2099 = torch.prim.ListConstruct %int-1_1903 : (!torch.int) -> !torch.list<int>
%true_1904 = torch.constant.bool true
%none_1905 = torch.constant.none
%2100 = torch.aten.mean.dim %2098, %2099, %true_1904, %none_1905 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2100, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_1906 = torch.constant.float 1.000000e-05
%int1_1907 = torch.constant.int 1
%2101 = torch.aten.add.Scalar %2100, %float1.000000e-05_1906, %int1_1907 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2101, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2102 = torch.aten.rsqrt %2101 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2102, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2103 = torch.aten.mul.Tensor %2097, %2102 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2103, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2104 = torch.aten.mul.Tensor %120, %2103 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2104, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2105 = torch.aten.div.Tensor %2104, %121 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2105, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1908 = torch.constant.float -2.400000e+02
%float2.400000e02_1909 = torch.constant.float 2.400000e+02
%2106 = torch.aten.clamp %2105, %float-2.400000e02_1908, %float2.400000e02_1909 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2106, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1910 = torch.constant.int 26
%2107 = torch.prims.convert_element_type %2106, %int26_1910 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2107, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1911 = torch.constant.int -2
%int-1_1912 = torch.constant.int -1
%2108 = torch.aten.transpose.int %122, %int-2_1911, %int-1_1912 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_1913 = torch.constant.int 4096
%2109 = torch.prim.ListConstruct %566, %int4096_1913 : (!torch.int, !torch.int) -> !torch.list<int>
%2110 = torch.aten.view %2107, %2109 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2110, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2111 = torch.aten.mm %2110, %2108 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2111, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_1914 = torch.constant.int 1
%int4096_1915 = torch.constant.int 4096
%2112 = torch.prim.ListConstruct %int1_1914, %566, %int4096_1915 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2113 = torch.aten.view %2111, %2112 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2113, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_1916 = torch.constant.int 15
%2114 = torch.prims.convert_element_type %2113, %int15_1916 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2114, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2115 = torch.aten.div.Tensor %2104, %123 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2115, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1917 = torch.constant.float -2.400000e+02
%float2.400000e02_1918 = torch.constant.float 2.400000e+02
%2116 = torch.aten.clamp %2115, %float-2.400000e02_1917, %float2.400000e02_1918 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2116, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1919 = torch.constant.int 26
%2117 = torch.prims.convert_element_type %2116, %int26_1919 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2117, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1920 = torch.constant.int -2
%int-1_1921 = torch.constant.int -1
%2118 = torch.aten.transpose.int %124, %int-2_1920, %int-1_1921 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1922 = torch.constant.int 4096
%2119 = torch.prim.ListConstruct %566, %int4096_1922 : (!torch.int, !torch.int) -> !torch.list<int>
%2120 = torch.aten.view %2117, %2119 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2120, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2121 = torch.aten.mm %2120, %2118 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2121, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1923 = torch.constant.int 1
%int1024_1924 = torch.constant.int 1024
%2122 = torch.prim.ListConstruct %int1_1923, %566, %int1024_1924 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2123 = torch.aten.view %2121, %2122 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2123, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1925 = torch.constant.int 15
%2124 = torch.prims.convert_element_type %2123, %int15_1925 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2124, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2125 = torch.aten.div.Tensor %2104, %125 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2125, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_1926 = torch.constant.float -2.400000e+02
%float2.400000e02_1927 = torch.constant.float 2.400000e+02
%2126 = torch.aten.clamp %2125, %float-2.400000e02_1926, %float2.400000e02_1927 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2126, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_1928 = torch.constant.int 26
%2127 = torch.prims.convert_element_type %2126, %int26_1928 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2127, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_1929 = torch.constant.int -2
%int-1_1930 = torch.constant.int -1
%2128 = torch.aten.transpose.int %126, %int-2_1929, %int-1_1930 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_1931 = torch.constant.int 4096
%2129 = torch.prim.ListConstruct %566, %int4096_1931 : (!torch.int, !torch.int) -> !torch.list<int>
%2130 = torch.aten.view %2127, %2129 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2130, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2131 = torch.aten.mm %2130, %2128 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2131, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_1932 = torch.constant.int 1
%int1024_1933 = torch.constant.int 1024
%2132 = torch.prim.ListConstruct %int1_1932, %566, %int1024_1933 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2133 = torch.aten.view %2131, %2132 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2133, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_1934 = torch.constant.int 15
%2134 = torch.prims.convert_element_type %2133, %int15_1934 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2134, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_1935 = torch.constant.int 1
%int32_1936 = torch.constant.int 32
%int128_1937 = torch.constant.int 128
%2135 = torch.prim.ListConstruct %int1_1935, %566, %int32_1936, %int128_1937 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2136 = torch.aten.view %2114, %2135 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2136, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_1938 = torch.constant.int 1
%int8_1939 = torch.constant.int 8
%int128_1940 = torch.constant.int 128
%2137 = torch.prim.ListConstruct %int1_1938, %566, %int8_1939, %int128_1940 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2138 = torch.aten.view %2124, %2137 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2138, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_1941 = torch.constant.int 1
%int8_1942 = torch.constant.int 8
%int128_1943 = torch.constant.int 128
%2139 = torch.prim.ListConstruct %int1_1941, %566, %int8_1942, %int128_1943 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2140 = torch.aten.view %2134, %2139 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2140, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
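// Q/K/V reshaped to [1, seq, 32, 128] and [1, seq, 8, 128]; the RoPE frequency-table construction
// is repeated below for this block, identical to the previous one.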
%int131072_1944 = torch.constant.int 131072
%none_1945 = torch.constant.none
%none_1946 = torch.constant.none
%cpu_1947 = torch.constant.device "cpu"
%false_1948 = torch.constant.bool false
%2141 = torch.aten.arange %int131072_1944, %none_1945, %none_1946, %cpu_1947, %false_1948 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1949 = torch.constant.int 0
%int128_1950 = torch.constant.int 128
%none_1951 = torch.constant.none
%none_1952 = torch.constant.none
%cpu_1953 = torch.constant.device "cpu"
%false_1954 = torch.constant.bool false
%2142 = torch.aten.arange.start %int0_1949, %int128_1950, %none_1951, %none_1952, %cpu_1953, %false_1954 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_1955 = torch.constant.int 2
%2143 = torch.aten.floor_divide.Scalar %2142, %int2_1955 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_1956 = torch.constant.int 6
%2144 = torch.prims.convert_element_type %2143, %int6_1956 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_1957 = torch.constant.int 128
%2145 = torch.aten.div.Scalar %2144, %int128_1957 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_1958 = torch.constant.float 2.000000e+00
%2146 = torch.aten.mul.Scalar %2145, %float2.000000e00_1958 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_1959 = torch.constant.float 5.000000e+05
%2147 = torch.aten.pow.Scalar %float5.000000e05_1959, %2146 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2148 = torch.aten.reciprocal %2147 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_1960 = torch.constant.float 1.000000e+00
%2149 = torch.aten.mul.Scalar %2148, %float1.000000e00_1960 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_1961 = torch.constant.int 131072
%int1_1962 = torch.constant.int 1
%2150 = torch.prim.ListConstruct %int131072_1961, %int1_1962 : (!torch.int, !torch.int) -> !torch.list<int>
%2151 = torch.aten.view %2141, %2150 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2152 = torch.aten.mul.Tensor %2151, %2149 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_1963 = torch.constant.int 1
%2153 = torch.aten.size.int %2113, %int1_1963 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_1964 = torch.constant.int 0
%2154 = torch.aten.add.int %int0_1964, %2153 : !torch.int, !torch.int -> !torch.int
%int0_1965 = torch.constant.int 0
%int0_1966 = torch.constant.int 0
%int1_1967 = torch.constant.int 1
%2155 = torch.aten.slice.Tensor %2152, %int0_1965, %int0_1966, %2154, %int1_1967 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2155, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1968 = torch.constant.int 1
%int0_1969 = torch.constant.int 0
%int9223372036854775807_1970 = torch.constant.int 9223372036854775807
%int1_1971 = torch.constant.int 1
%2156 = torch.aten.slice.Tensor %2155, %int1_1968, %int0_1969, %int9223372036854775807_1970, %int1_1971 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2156, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_1972 = torch.constant.int 1
%int0_1973 = torch.constant.int 0
%int9223372036854775807_1974 = torch.constant.int 9223372036854775807
%int1_1975 = torch.constant.int 1
%2157 = torch.aten.slice.Tensor %2156, %int1_1972, %int0_1973, %int9223372036854775807_1974, %int1_1975 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2157, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_1976 = torch.constant.int 0
%2158 = torch.aten.unsqueeze %2157, %int0_1976 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2158, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1977 = torch.constant.int 1
%int0_1978 = torch.constant.int 0
%int9223372036854775807_1979 = torch.constant.int 9223372036854775807
%int1_1980 = torch.constant.int 1
%2159 = torch.aten.slice.Tensor %2158, %int1_1977, %int0_1978, %int9223372036854775807_1979, %int1_1980 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2159, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_1981 = torch.constant.int 2
%int0_1982 = torch.constant.int 0
%int9223372036854775807_1983 = torch.constant.int 9223372036854775807
%int1_1984 = torch.constant.int 1
%2160 = torch.aten.slice.Tensor %2159, %int2_1981, %int0_1982, %int9223372036854775807_1983, %int1_1984 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2160, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_1985 = torch.constant.int 1
%int1_1986 = torch.constant.int 1
%int1_1987 = torch.constant.int 1
%2161 = torch.prim.ListConstruct %int1_1985, %int1_1986, %int1_1987 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2162 = torch.aten.repeat %2160, %2161 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2162, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
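// Apply RoPE to Q: upcast to f32, call the sharktank rotary-embedding kernel with the angle table, then cast back to bf16.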
%int6_1988 = torch.constant.int 6
%2163 = torch.prims.convert_element_type %2136, %int6_1988 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2163, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2164 = torch_c.to_builtin_tensor %2163 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2165 = torch_c.to_builtin_tensor %2162 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2166 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2164, %2165) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2167 = torch_c.from_builtin_tensor %2166 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2167, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_1989 = torch.constant.int 15
%2168 = torch.prims.convert_element_type %2167, %int15_1989 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2168, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
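// The same RoPE angle-table construction is repeated below for the key path.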
%int131072_1990 = torch.constant.int 131072
%none_1991 = torch.constant.none
%none_1992 = torch.constant.none
%cpu_1993 = torch.constant.device "cpu"
%false_1994 = torch.constant.bool false
%2169 = torch.aten.arange %int131072_1990, %none_1991, %none_1992, %cpu_1993, %false_1994 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_1995 = torch.constant.int 0
%int128_1996 = torch.constant.int 128
%none_1997 = torch.constant.none
%none_1998 = torch.constant.none
%cpu_1999 = torch.constant.device "cpu"
%false_2000 = torch.constant.bool false
%2170 = torch.aten.arange.start %int0_1995, %int128_1996, %none_1997, %none_1998, %cpu_1999, %false_2000 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2001 = torch.constant.int 2
%2171 = torch.aten.floor_divide.Scalar %2170, %int2_2001 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2002 = torch.constant.int 6
%2172 = torch.prims.convert_element_type %2171, %int6_2002 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2003 = torch.constant.int 128
%2173 = torch.aten.div.Scalar %2172, %int128_2003 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2004 = torch.constant.float 2.000000e+00
%2174 = torch.aten.mul.Scalar %2173, %float2.000000e00_2004 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2005 = torch.constant.float 5.000000e+05
%2175 = torch.aten.pow.Scalar %float5.000000e05_2005, %2174 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2176 = torch.aten.reciprocal %2175 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2006 = torch.constant.float 1.000000e+00
%2177 = torch.aten.mul.Scalar %2176, %float1.000000e00_2006 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2007 = torch.constant.int 131072
%int1_2008 = torch.constant.int 1
%2178 = torch.prim.ListConstruct %int131072_2007, %int1_2008 : (!torch.int, !torch.int) -> !torch.list<int>
%2179 = torch.aten.view %2169, %2178 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2180 = torch.aten.mul.Tensor %2179, %2177 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2009 = torch.constant.int 1
%2181 = torch.aten.size.int %2123, %int1_2009 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2010 = torch.constant.int 0
%2182 = torch.aten.add.int %int0_2010, %2181 : !torch.int, !torch.int -> !torch.int
%int0_2011 = torch.constant.int 0
%int0_2012 = torch.constant.int 0
%int1_2013 = torch.constant.int 1
%2183 = torch.aten.slice.Tensor %2180, %int0_2011, %int0_2012, %2182, %int1_2013 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2183, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2014 = torch.constant.int 1
%int0_2015 = torch.constant.int 0
%int9223372036854775807_2016 = torch.constant.int 9223372036854775807
%int1_2017 = torch.constant.int 1
%2184 = torch.aten.slice.Tensor %2183, %int1_2014, %int0_2015, %int9223372036854775807_2016, %int1_2017 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2184, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2018 = torch.constant.int 1
%int0_2019 = torch.constant.int 0
%int9223372036854775807_2020 = torch.constant.int 9223372036854775807
%int1_2021 = torch.constant.int 1
%2185 = torch.aten.slice.Tensor %2184, %int1_2018, %int0_2019, %int9223372036854775807_2020, %int1_2021 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2185, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2022 = torch.constant.int 0
%2186 = torch.aten.unsqueeze %2185, %int0_2022 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2186, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2023 = torch.constant.int 1
%int0_2024 = torch.constant.int 0
%int9223372036854775807_2025 = torch.constant.int 9223372036854775807
%int1_2026 = torch.constant.int 1
%2187 = torch.aten.slice.Tensor %2186, %int1_2023, %int0_2024, %int9223372036854775807_2025, %int1_2026 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2187, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2027 = torch.constant.int 2
%int0_2028 = torch.constant.int 0
%int9223372036854775807_2029 = torch.constant.int 9223372036854775807
%int1_2030 = torch.constant.int 1
%2188 = torch.aten.slice.Tensor %2187, %int2_2027, %int0_2028, %int9223372036854775807_2029, %int1_2030 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2188, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2031 = torch.constant.int 1
%int1_2032 = torch.constant.int 1
%int1_2033 = torch.constant.int 1
%2189 = torch.prim.ListConstruct %int1_2031, %int1_2032, %int1_2033 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2190 = torch.aten.repeat %2188, %2189 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2190, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_2034 = torch.constant.int 6
%2191 = torch.prims.convert_element_type %2138, %int6_2034 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2191, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2192 = torch_c.to_builtin_tensor %2191 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2193 = torch_c.to_builtin_tensor %2190 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2194 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2192, %2193) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2195 = torch_c.from_builtin_tensor %2194 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2195, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2035 = torch.constant.int 15
%2196 = torch.prims.convert_element_type %2195, %int15_2035 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2196, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
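// Quantize the rotated K and the V projection for the KV cache: divide by the scalar scale %127, clamp to the
// f8E4M3FNUZ range [-240, 240], and convert to f8E4M3FNUZ.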
%2197 = torch.aten.div.Tensor %2196, %127 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2197, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2036 = torch.constant.float -2.400000e+02
%float2.400000e02_2037 = torch.constant.float 2.400000e+02
%2198 = torch.aten.clamp %2197, %float-2.400000e02_2036, %float2.400000e02_2037 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2198, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2038 = torch.constant.int 26
%2199 = torch.prims.convert_element_type %2198, %int26_2038 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2199, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2200 = torch.aten.div.Tensor %2140, %127 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2200, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2039 = torch.constant.float -2.400000e+02
%float2.400000e02_2040 = torch.constant.float 2.400000e+02
%2201 = torch.aten.clamp %2200, %float-2.400000e02_2039, %float2.400000e02_2040 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2201, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2041 = torch.constant.int 26
%2202 = torch.prims.convert_element_type %2201, %int26_2041 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2202, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
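// Write K into the paged KV cache. The page ids in %arg2 are scaled by 64 slots per page and offset by 14,
// which appears to be this block's K slot in the per-page layout [32 layers, 2 (K/V), 32 tokens, 8 heads, 128];
// the flat cache buffer [?, 2097152] (f16) is viewed in that layout, flattened, and updated with index_put.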
%int64_2042 = torch.constant.int 64
%2203 = torch.aten.mul.Scalar %arg2, %int64_2042 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2203, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int14 = torch.constant.int 14
%int1_2043 = torch.constant.int 1
%2204 = torch.aten.add.Scalar %2203, %int14, %int1_2043 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2204, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2044 = torch.constant.int 1
%int32_2045 = torch.constant.int 32
%int8_2046 = torch.constant.int 8
%int128_2047 = torch.constant.int 128
%2205 = torch.prim.ListConstruct %int1_2044, %670, %int32_2045, %int8_2046, %int128_2047 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2206 = torch.aten.view %2199, %2205 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2206, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2048 = torch.constant.int 32
%int8_2049 = torch.constant.int 8
%int128_2050 = torch.constant.int 128
%2207 = torch.prim.ListConstruct %670, %int32_2048, %int8_2049, %int128_2050 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2208 = torch.aten.view %2206, %2207 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2208, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2209 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2210 = torch.aten.view %2204, %2209 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2210, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2051 = torch.constant.int 32
%int2_2052 = torch.constant.int 2
%int32_2053 = torch.constant.int 32
%int8_2054 = torch.constant.int 8
%int128_2055 = torch.constant.int 128
%2211 = torch.prim.ListConstruct %661, %int32_2051, %int2_2052, %int32_2053, %int8_2054, %int128_2055 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2212 = torch.aten.view %2019, %2211 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2212, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2056 = torch.constant.int 32
%2213 = torch.aten.mul.int %661, %int32_2056 : !torch.int, !torch.int -> !torch.int
%int2_2057 = torch.constant.int 2
%2214 = torch.aten.mul.int %2213, %int2_2057 : !torch.int, !torch.int -> !torch.int
%int32_2058 = torch.constant.int 32
%int8_2059 = torch.constant.int 8
%int128_2060 = torch.constant.int 128
%2215 = torch.prim.ListConstruct %2214, %int32_2058, %int8_2059, %int128_2060 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2216 = torch.aten.view %2212, %2215 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2216, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2217 = torch.prim.ListConstruct %2210 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2061 = torch.constant.bool false
%2218 = torch.aten.index_put %2216, %2217, %2208, %false_2061 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2218, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2062 = torch.constant.int 32
%int2_2063 = torch.constant.int 2
%int32_2064 = torch.constant.int 32
%int8_2065 = torch.constant.int 8
%int128_2066 = torch.constant.int 128
%2219 = torch.prim.ListConstruct %661, %int32_2062, %int2_2063, %int32_2064, %int8_2065, %int128_2066 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2220 = torch.aten.view %2218, %2219 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2220, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2067 = torch.constant.int 2097152
%2221 = torch.prim.ListConstruct %661, %int2097152_2067 : (!torch.int, !torch.int) -> !torch.list<int>
%2222 = torch.aten.view %2220, %2221 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2222, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_2068 = torch.constant.int 32
%int2_2069 = torch.constant.int 2
%int32_2070 = torch.constant.int 32
%int8_2071 = torch.constant.int 8
%int128_2072 = torch.constant.int 128
%2223 = torch.prim.ListConstruct %661, %int32_2068, %int2_2069, %int32_2070, %int8_2071, %int128_2072 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2224 = torch.aten.view %2222, %2223 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2224, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2073 = torch.constant.int 32
%int8_2074 = torch.constant.int 8
%int128_2075 = torch.constant.int 128
%2225 = torch.prim.ListConstruct %2214, %int32_2073, %int8_2074, %int128_2075 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2226 = torch.aten.view %2224, %2225 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2226, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
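// Scatter the quantized V at the adjacent slot (index + 1, i.e. offset 15), then view the cache back to the flat
// [?, 2097152] page buffer.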
%int1_2076 = torch.constant.int 1
%int32_2077 = torch.constant.int 32
%int8_2078 = torch.constant.int 8
%int128_2079 = torch.constant.int 128
%2227 = torch.prim.ListConstruct %int1_2076, %670, %int32_2077, %int8_2078, %int128_2079 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2228 = torch.aten.view %2202, %2227 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2228, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2080 = torch.constant.int 32
%int8_2081 = torch.constant.int 8
%int128_2082 = torch.constant.int 128
%2229 = torch.prim.ListConstruct %670, %int32_2080, %int8_2081, %int128_2082 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2230 = torch.aten.view %2228, %2229 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2230, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2083 = torch.constant.int 1
%int1_2084 = torch.constant.int 1
%2231 = torch.aten.add.Scalar %2204, %int1_2083, %int1_2084 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2231, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2232 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2233 = torch.aten.view %2231, %2232 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2233, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2234 = torch.prim.ListConstruct %2233 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2085 = torch.constant.bool false
%2235 = torch.aten.index_put %2226, %2234, %2230, %false_2085 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2235, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2086 = torch.constant.int 32
%int2_2087 = torch.constant.int 2
%int32_2088 = torch.constant.int 32
%int8_2089 = torch.constant.int 8
%int128_2090 = torch.constant.int 128
%2236 = torch.prim.ListConstruct %661, %int32_2086, %int2_2087, %int32_2088, %int8_2089, %int128_2090 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2237 = torch.aten.view %2235, %2236 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2237, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2091 = torch.constant.int 2097152
%2238 = torch.prim.ListConstruct %661, %int2097152_2091 : (!torch.int, !torch.int) -> !torch.list<int>
%2239 = torch.aten.view %2237, %2238 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2239, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
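// Grouped-query attention expansion (repeat_kv): unsqueeze the 8 KV heads and expand each by a factor of 4,
// giving K and V with 32 heads to match the query heads.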
%int-2_2092 = torch.constant.int -2
%2240 = torch.aten.unsqueeze %2199, %int-2_2092 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2240, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2093 = torch.constant.int 1
%int8_2094 = torch.constant.int 8
%int4_2095 = torch.constant.int 4
%int128_2096 = torch.constant.int 128
%2241 = torch.prim.ListConstruct %int1_2093, %2181, %int8_2094, %int4_2095, %int128_2096 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2097 = torch.constant.bool false
%2242 = torch.aten.expand %2240, %2241, %false_2097 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2242, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2098 = torch.constant.int 0
%2243 = torch.aten.clone %2242, %int0_2098 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2243, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2099 = torch.constant.int 1
%int32_2100 = torch.constant.int 32
%int128_2101 = torch.constant.int 128
%2244 = torch.prim.ListConstruct %int1_2099, %2181, %int32_2100, %int128_2101 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2245 = torch.aten._unsafe_view %2243, %2244 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2245, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2102 = torch.constant.int -2
%2246 = torch.aten.unsqueeze %2202, %int-2_2102 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2246, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2103 = torch.constant.int 1
%2247 = torch.aten.size.int %2133, %int1_2103 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2104 = torch.constant.int 1
%int8_2105 = torch.constant.int 8
%int4_2106 = torch.constant.int 4
%int128_2107 = torch.constant.int 128
%2248 = torch.prim.ListConstruct %int1_2104, %2247, %int8_2105, %int4_2106, %int128_2107 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2108 = torch.constant.bool false
%2249 = torch.aten.expand %2246, %2248, %false_2108 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2249, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2109 = torch.constant.int 0
%2250 = torch.aten.clone %2249, %int0_2109 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2250, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2110 = torch.constant.int 1
%int32_2111 = torch.constant.int 32
%int128_2112 = torch.constant.int 128
%2251 = torch.prim.ListConstruct %int1_2110, %2247, %int32_2111, %int128_2112 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2252 = torch.aten._unsafe_view %2250, %2251 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2252, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
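// Dequantize the expanded K/V for attention: convert f8E4M3FNUZ to f32, multiply by the scale %127, and cast to bf16.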
%int6_2113 = torch.constant.int 6
%2253 = torch.prims.convert_element_type %2245, %int6_2113 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2253, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2254 = torch.aten.mul.Tensor %2253, %127 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2254, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2114 = torch.constant.int 15
%2255 = torch.prims.convert_element_type %2254, %int15_2114 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2255, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2115 = torch.constant.int 6
%2256 = torch.prims.convert_element_type %2252, %int6_2115 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2256, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2257 = torch.aten.mul.Tensor %2256, %127 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2257, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2116 = torch.constant.int 15
%2258 = torch.prims.convert_element_type %2257, %int15_2116 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2258, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
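// Transpose Q/K/V to [1, 32, seq, 128] and run scaled dot-product flash attention (dropout 0.0, is_causal = true).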
%int1_2117 = torch.constant.int 1
%int2_2118 = torch.constant.int 2
%2259 = torch.aten.transpose.int %2168, %int1_2117, %int2_2118 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2259, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2119 = torch.constant.int 1
%int2_2120 = torch.constant.int 2
%2260 = torch.aten.transpose.int %2255, %int1_2119, %int2_2120 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2260, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2121 = torch.constant.int 1
%int2_2122 = torch.constant.int 2
%2261 = torch.aten.transpose.int %2258, %int1_2121, %int2_2122 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2261, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2123 = torch.constant.float 0.000000e+00
%true_2124 = torch.constant.bool true
%none_2125 = torch.constant.none
%none_2126 = torch.constant.none
%2262:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2259, %2260, %2261, %float0.000000e00_2123, %true_2124, %none_2125, %none_2126) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2262#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
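// Transpose the attention output back to [1, seq, 32, 128], flatten the heads to 4096, re-quantize to f8E4M3FNUZ
// (scale %128, clamp to [-240, 240]), and apply the attn_output projection (4096 x 4096 matmul).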
%int1_2127 = torch.constant.int 1
%int2_2128 = torch.constant.int 2
%2263 = torch.aten.transpose.int %2262#0, %int1_2127, %int2_2128 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2263, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2129 = torch.constant.int 1
%int4096_2130 = torch.constant.int 4096
%2264 = torch.prim.ListConstruct %int1_2129, %2153, %int4096_2130 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2265 = torch.aten.view %2263, %2264 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2265, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2266 = torch.aten.div.Tensor %2265, %128 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2266, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2131 = torch.constant.float -2.400000e+02
%float2.400000e02_2132 = torch.constant.float 2.400000e+02
%2267 = torch.aten.clamp %2266, %float-2.400000e02_2131, %float2.400000e02_2132 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2267, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2133 = torch.constant.int 26
%2268 = torch.prims.convert_element_type %2267, %int26_2133 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2268, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2134 = torch.constant.int -2
%int-1_2135 = torch.constant.int -1
%2269 = torch.aten.transpose.int %129, %int-2_2134, %int-1_2135 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2136 = torch.constant.int 4096
%2270 = torch.prim.ListConstruct %2153, %int4096_2136 : (!torch.int, !torch.int) -> !torch.list<int>
%2271 = torch.aten.view %2268, %2270 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2271, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2272 = torch.aten.mm %2271, %2269 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2272, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2137 = torch.constant.int 1
%int4096_2138 = torch.constant.int 4096
%2273 = torch.prim.ListConstruct %int1_2137, %2153, %int4096_2138 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2274 = torch.aten.view %2272, %2273 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2274, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2139 = torch.constant.int 15
%2275 = torch.prims.convert_element_type %2274, %int15_2139 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2275, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
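// First residual connection: add the attention output onto the f32 residual stream, then apply the ffn_norm
// RMSNorm and quantize the result to f8E4M3FNUZ for the feed-forward matmuls. Illustrative pseudocode:
//   y = x / sqrt(mean(x * x, dim=-1) + 1e-05) * norm_weight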
%int1_2140 = torch.constant.int 1
%2276 = torch.aten.add.Tensor %2097, %2275, %int1_2140 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2276, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2_2141 = torch.constant.int 2
%2277 = torch.aten.pow.Tensor_Scalar %2276, %int2_2141 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2277, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2142 = torch.constant.int -1
%2278 = torch.prim.ListConstruct %int-1_2142 : (!torch.int) -> !torch.list<int>
%true_2143 = torch.constant.bool true
%none_2144 = torch.constant.none
%2279 = torch.aten.mean.dim %2277, %2278, %true_2143, %none_2144 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2279, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2145 = torch.constant.float 1.000000e-05
%int1_2146 = torch.constant.int 1
%2280 = torch.aten.add.Scalar %2279, %float1.000000e-05_2145, %int1_2146 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2280, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2281 = torch.aten.rsqrt %2280 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2281, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2282 = torch.aten.mul.Tensor %2276, %2281 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2282, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2283 = torch.aten.mul.Tensor %130, %2282 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2283, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2284 = torch.aten.div.Tensor %2283, %131 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2284, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2147 = torch.constant.float -2.400000e+02
%float2.400000e02_2148 = torch.constant.float 2.400000e+02
%2285 = torch.aten.clamp %2284, %float-2.400000e02_2147, %float2.400000e02_2148 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2285, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2149 = torch.constant.int 26
%2286 = torch.prims.convert_element_type %2285, %int26_2149 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2286, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
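// ffn_gate projection: f8 matmul from 4096 to 14336, dequantized to bf16 and passed through SiLU.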
%int-2_2150 = torch.constant.int -2
%int-1_2151 = torch.constant.int -1
%2287 = torch.aten.transpose.int %132, %int-2_2150, %int-1_2151 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2152 = torch.constant.int 4096
%2288 = torch.prim.ListConstruct %566, %int4096_2152 : (!torch.int, !torch.int) -> !torch.list<int>
%2289 = torch.aten.view %2286, %2288 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2289, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2290 = torch.aten.mm %2289, %2287 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2290, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2153 = torch.constant.int 1
%int14336_2154 = torch.constant.int 14336
%2291 = torch.prim.ListConstruct %int1_2153, %566, %int14336_2154 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2292 = torch.aten.view %2290, %2291 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2292, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2155 = torch.constant.int 15
%2293 = torch.prims.convert_element_type %2292, %int15_2155 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2293, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2294 = torch.aten.silu %2293 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2294, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
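// ffn_up projection (4096 -> 14336); multiplying silu(gate) by up below forms the SwiGLU activation.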
%2295 = torch.aten.div.Tensor %2283, %133 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2295, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2156 = torch.constant.float -2.400000e+02
%float2.400000e02_2157 = torch.constant.float 2.400000e+02
%2296 = torch.aten.clamp %2295, %float-2.400000e02_2156, %float2.400000e02_2157 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2296, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2158 = torch.constant.int 26
%2297 = torch.prims.convert_element_type %2296, %int26_2158 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2297, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2159 = torch.constant.int -2
%int-1_2160 = torch.constant.int -1
%2298 = torch.aten.transpose.int %134, %int-2_2159, %int-1_2160 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2161 = torch.constant.int 4096
%2299 = torch.prim.ListConstruct %566, %int4096_2161 : (!torch.int, !torch.int) -> !torch.list<int>
%2300 = torch.aten.view %2297, %2299 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2300, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2301 = torch.aten.mm %2300, %2298 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2301, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2162 = torch.constant.int 1
%int14336_2163 = torch.constant.int 14336
%2302 = torch.prim.ListConstruct %int1_2162, %566, %int14336_2163 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2303 = torch.aten.view %2301, %2302 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2303, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2164 = torch.constant.int 15
%2304 = torch.prims.convert_element_type %2303, %int15_2164 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2304, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2305 = torch.aten.mul.Tensor %2294, %2304 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2305, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
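// Re-quantize the SwiGLU product to f8E4M3FNUZ and apply the ffn_down projection (14336 -> 4096).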
%2306 = torch.aten.div.Tensor %2305, %135 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2306, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2165 = torch.constant.float -2.400000e+02
%float2.400000e02_2166 = torch.constant.float 2.400000e+02
%2307 = torch.aten.clamp %2306, %float-2.400000e02_2165, %float2.400000e02_2166 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2307, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2167 = torch.constant.int 26
%2308 = torch.prims.convert_element_type %2307, %int26_2167 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2308, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_2168 = torch.constant.int -2
%int-1_2169 = torch.constant.int -1
%2309 = torch.aten.transpose.int %136, %int-2_2168, %int-1_2169 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2170 = torch.constant.int 1
%2310 = torch.aten.size.int %2292, %int1_2170 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2171 = torch.constant.int 14336
%2311 = torch.prim.ListConstruct %2310, %int14336_2171 : (!torch.int, !torch.int) -> !torch.list<int>
%2312 = torch.aten.view %2308, %2311 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2312, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2313 = torch.aten.mm %2312, %2309 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2313, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2172 = torch.constant.int 1
%int4096_2173 = torch.constant.int 4096
%2314 = torch.prim.ListConstruct %int1_2172, %2310, %int4096_2173 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2315 = torch.aten.view %2313, %2314 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2315, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2174 = torch.constant.int 15
%2316 = torch.prims.convert_element_type %2315, %int15_2174 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2316, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
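// Second residual connection: add the FFN output onto the residual stream, then the next block's attn_norm
// RMSNorm followed by per-input quantization for its Q/K/V projections.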
%int1_2175 = torch.constant.int 1
%2317 = torch.aten.add.Tensor %2276, %2316, %int1_2175 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2317, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2_2176 = torch.constant.int 2
%2318 = torch.aten.pow.Tensor_Scalar %2317, %int2_2176 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2318, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2177 = torch.constant.int -1
%2319 = torch.prim.ListConstruct %int-1_2177 : (!torch.int) -> !torch.list<int>
%true_2178 = torch.constant.bool true
%none_2179 = torch.constant.none
%2320 = torch.aten.mean.dim %2318, %2319, %true_2178, %none_2179 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2320, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2180 = torch.constant.float 1.000000e-05
%int1_2181 = torch.constant.int 1
%2321 = torch.aten.add.Scalar %2320, %float1.000000e-05_2180, %int1_2181 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2321, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2322 = torch.aten.rsqrt %2321 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2322, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2323 = torch.aten.mul.Tensor %2317, %2322 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2323, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2324 = torch.aten.mul.Tensor %137, %2323 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2324, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2325 = torch.aten.div.Tensor %2324, %138 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2325, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2182 = torch.constant.float -2.400000e+02
%float2.400000e02_2183 = torch.constant.float 2.400000e+02
%2326 = torch.aten.clamp %2325, %float-2.400000e02_2182, %float2.400000e02_2183 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2326, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2184 = torch.constant.int 26
%2327 = torch.prims.convert_element_type %2326, %int26_2184 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2327, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
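// attn_q projection in f8 (4096 x 4096), followed by the attn_k and attn_v projections (1024 x 4096); each path
// re-scales the normed activations with its own input scale (%138 / %140 / %142), clamps and quantizes before
// the matmul, and dequantizes the result to bf16.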
%int-2_2185 = torch.constant.int -2
%int-1_2186 = torch.constant.int -1
%2328 = torch.aten.transpose.int %139, %int-2_2185, %int-1_2186 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2187 = torch.constant.int 4096
%2329 = torch.prim.ListConstruct %566, %int4096_2187 : (!torch.int, !torch.int) -> !torch.list<int>
%2330 = torch.aten.view %2327, %2329 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2330, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2331 = torch.aten.mm %2330, %2328 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2331, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2188 = torch.constant.int 1
%int4096_2189 = torch.constant.int 4096
%2332 = torch.prim.ListConstruct %int1_2188, %566, %int4096_2189 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2333 = torch.aten.view %2331, %2332 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2333, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2190 = torch.constant.int 15
%2334 = torch.prims.convert_element_type %2333, %int15_2190 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2334, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2335 = torch.aten.div.Tensor %2324, %140 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2335, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2191 = torch.constant.float -2.400000e+02
%float2.400000e02_2192 = torch.constant.float 2.400000e+02
%2336 = torch.aten.clamp %2335, %float-2.400000e02_2191, %float2.400000e02_2192 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2336, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2193 = torch.constant.int 26
%2337 = torch.prims.convert_element_type %2336, %int26_2193 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2337, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2194 = torch.constant.int -2
%int-1_2195 = torch.constant.int -1
%2338 = torch.aten.transpose.int %141, %int-2_2194, %int-1_2195 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2196 = torch.constant.int 4096
%2339 = torch.prim.ListConstruct %566, %int4096_2196 : (!torch.int, !torch.int) -> !torch.list<int>
%2340 = torch.aten.view %2337, %2339 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2340, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2341 = torch.aten.mm %2340, %2338 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2341, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2197 = torch.constant.int 1
%int1024_2198 = torch.constant.int 1024
%2342 = torch.prim.ListConstruct %int1_2197, %566, %int1024_2198 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2343 = torch.aten.view %2341, %2342 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2343, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2199 = torch.constant.int 15
%2344 = torch.prims.convert_element_type %2343, %int15_2199 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2344, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2345 = torch.aten.div.Tensor %2324, %142 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2345, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2200 = torch.constant.float -2.400000e+02
%float2.400000e02_2201 = torch.constant.float 2.400000e+02
%2346 = torch.aten.clamp %2345, %float-2.400000e02_2200, %float2.400000e02_2201 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2346, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2202 = torch.constant.int 26
%2347 = torch.prims.convert_element_type %2346, %int26_2202 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2347, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2203 = torch.constant.int -2
%int-1_2204 = torch.constant.int -1
%2348 = torch.aten.transpose.int %143, %int-2_2203, %int-1_2204 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2205 = torch.constant.int 4096
%2349 = torch.prim.ListConstruct %566, %int4096_2205 : (!torch.int, !torch.int) -> !torch.list<int>
%2350 = torch.aten.view %2347, %2349 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2350, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2351 = torch.aten.mm %2350, %2348 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2351, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2206 = torch.constant.int 1
%int1024_2207 = torch.constant.int 1024
%2352 = torch.prim.ListConstruct %int1_2206, %566, %int1024_2207 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2353 = torch.aten.view %2351, %2352 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2353, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2208 = torch.constant.int 15
%2354 = torch.prims.convert_element_type %2353, %int15_2208 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2354, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_2209 = torch.constant.int 1
%int32_2210 = torch.constant.int 32
%int128_2211 = torch.constant.int 128
%2355 = torch.prim.ListConstruct %int1_2209, %566, %int32_2210, %int128_2211 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2356 = torch.aten.view %2334, %2355 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2356, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2212 = torch.constant.int 1
%int8_2213 = torch.constant.int 8
%int128_2214 = torch.constant.int 128
%2357 = torch.prim.ListConstruct %int1_2212, %566, %int8_2213, %int128_2214 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2358 = torch.aten.view %2344, %2357 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2358, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_2215 = torch.constant.int 1
%int8_2216 = torch.constant.int 8
%int128_2217 = torch.constant.int 128
%2359 = torch.prim.ListConstruct %int1_2215, %566, %int8_2216, %int128_2217 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2360 = torch.aten.view %2354, %2359 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2360, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
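// Build the rotary-embedding angle table: positions arange(131072) times inverse frequencies
// 500000^(-2*floor(i/2)/128), giving a [131072,128] table (rope theta 500000, 131072 max positions).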
%int131072_2218 = torch.constant.int 131072
%none_2219 = torch.constant.none
%none_2220 = torch.constant.none
%cpu_2221 = torch.constant.device "cpu"
%false_2222 = torch.constant.bool false
%2361 = torch.aten.arange %int131072_2218, %none_2219, %none_2220, %cpu_2221, %false_2222 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2223 = torch.constant.int 0
%int128_2224 = torch.constant.int 128
%none_2225 = torch.constant.none
%none_2226 = torch.constant.none
%cpu_2227 = torch.constant.device "cpu"
%false_2228 = torch.constant.bool false
%2362 = torch.aten.arange.start %int0_2223, %int128_2224, %none_2225, %none_2226, %cpu_2227, %false_2228 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2229 = torch.constant.int 2
%2363 = torch.aten.floor_divide.Scalar %2362, %int2_2229 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2230 = torch.constant.int 6
%2364 = torch.prims.convert_element_type %2363, %int6_2230 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2231 = torch.constant.int 128
%2365 = torch.aten.div.Scalar %2364, %int128_2231 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2232 = torch.constant.float 2.000000e+00
%2366 = torch.aten.mul.Scalar %2365, %float2.000000e00_2232 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2233 = torch.constant.float 5.000000e+05
%2367 = torch.aten.pow.Scalar %float5.000000e05_2233, %2366 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2368 = torch.aten.reciprocal %2367 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2234 = torch.constant.float 1.000000e+00
%2369 = torch.aten.mul.Scalar %2368, %float1.000000e00_2234 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2235 = torch.constant.int 131072
%int1_2236 = torch.constant.int 1
%2370 = torch.prim.ListConstruct %int131072_2235, %int1_2236 : (!torch.int, !torch.int) -> !torch.list<int>
%2371 = torch.aten.view %2361, %2370 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2372 = torch.aten.mul.Tensor %2371, %2369 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
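// Slice the angle table down to the current sequence length and shape it to [1, seq, 128].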
%int1_2237 = torch.constant.int 1
%2373 = torch.aten.size.int %2333, %int1_2237 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2238 = torch.constant.int 0
%2374 = torch.aten.add.int %int0_2238, %2373 : !torch.int, !torch.int -> !torch.int
%int0_2239 = torch.constant.int 0
%int0_2240 = torch.constant.int 0
%int1_2241 = torch.constant.int 1
%2375 = torch.aten.slice.Tensor %2372, %int0_2239, %int0_2240, %2374, %int1_2241 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2375, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2242 = torch.constant.int 1
%int0_2243 = torch.constant.int 0
%int9223372036854775807_2244 = torch.constant.int 9223372036854775807
%int1_2245 = torch.constant.int 1
%2376 = torch.aten.slice.Tensor %2375, %int1_2242, %int0_2243, %int9223372036854775807_2244, %int1_2245 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2376, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2246 = torch.constant.int 1
%int0_2247 = torch.constant.int 0
%int9223372036854775807_2248 = torch.constant.int 9223372036854775807
%int1_2249 = torch.constant.int 1
%2377 = torch.aten.slice.Tensor %2376, %int1_2246, %int0_2247, %int9223372036854775807_2248, %int1_2249 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2377, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2250 = torch.constant.int 0
%2378 = torch.aten.unsqueeze %2377, %int0_2250 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2378, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2251 = torch.constant.int 1
%int0_2252 = torch.constant.int 0
%int9223372036854775807_2253 = torch.constant.int 9223372036854775807
%int1_2254 = torch.constant.int 1
%2379 = torch.aten.slice.Tensor %2378, %int1_2251, %int0_2252, %int9223372036854775807_2253, %int1_2254 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2379, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2255 = torch.constant.int 2
%int0_2256 = torch.constant.int 0
%int9223372036854775807_2257 = torch.constant.int 9223372036854775807
%int1_2258 = torch.constant.int 1
%2380 = torch.aten.slice.Tensor %2379, %int2_2255, %int0_2256, %int9223372036854775807_2257, %int1_2258 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2380, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2259 = torch.constant.int 1
%int1_2260 = torch.constant.int 1
%int1_2261 = torch.constant.int 1
%2381 = torch.prim.ListConstruct %int1_2259, %int1_2260, %int1_2261 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2382 = torch.aten.repeat %2380, %2381 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2382, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
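// Apply rotary position embedding to the query heads in f32 via the sharktank rotary kernel,
// then cast back to bf16.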
%int6_2262 = torch.constant.int 6
%2383 = torch.prims.convert_element_type %2356, %int6_2262 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2383, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2384 = torch_c.to_builtin_tensor %2383 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2385 = torch_c.to_builtin_tensor %2382 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2386 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2384, %2385) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2387 = torch_c.from_builtin_tensor %2386 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2387, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2263 = torch.constant.int 15
%2388 = torch.prims.convert_element_type %2387, %int15_2263 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2388, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
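// The angle-table construction and slicing are repeated for the key heads (recomputed rather than
// reused), followed by the 8-head rotary kernel.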
%int131072_2264 = torch.constant.int 131072
%none_2265 = torch.constant.none
%none_2266 = torch.constant.none
%cpu_2267 = torch.constant.device "cpu"
%false_2268 = torch.constant.bool false
%2389 = torch.aten.arange %int131072_2264, %none_2265, %none_2266, %cpu_2267, %false_2268 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2269 = torch.constant.int 0
%int128_2270 = torch.constant.int 128
%none_2271 = torch.constant.none
%none_2272 = torch.constant.none
%cpu_2273 = torch.constant.device "cpu"
%false_2274 = torch.constant.bool false
%2390 = torch.aten.arange.start %int0_2269, %int128_2270, %none_2271, %none_2272, %cpu_2273, %false_2274 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2275 = torch.constant.int 2
%2391 = torch.aten.floor_divide.Scalar %2390, %int2_2275 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2276 = torch.constant.int 6
%2392 = torch.prims.convert_element_type %2391, %int6_2276 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2277 = torch.constant.int 128
%2393 = torch.aten.div.Scalar %2392, %int128_2277 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2278 = torch.constant.float 2.000000e+00
%2394 = torch.aten.mul.Scalar %2393, %float2.000000e00_2278 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2279 = torch.constant.float 5.000000e+05
%2395 = torch.aten.pow.Scalar %float5.000000e05_2279, %2394 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2396 = torch.aten.reciprocal %2395 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2280 = torch.constant.float 1.000000e+00
%2397 = torch.aten.mul.Scalar %2396, %float1.000000e00_2280 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2281 = torch.constant.int 131072
%int1_2282 = torch.constant.int 1
%2398 = torch.prim.ListConstruct %int131072_2281, %int1_2282 : (!torch.int, !torch.int) -> !torch.list<int>
%2399 = torch.aten.view %2389, %2398 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2400 = torch.aten.mul.Tensor %2399, %2397 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2283 = torch.constant.int 1
%2401 = torch.aten.size.int %2343, %int1_2283 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2284 = torch.constant.int 0
%2402 = torch.aten.add.int %int0_2284, %2401 : !torch.int, !torch.int -> !torch.int
%int0_2285 = torch.constant.int 0
%int0_2286 = torch.constant.int 0
%int1_2287 = torch.constant.int 1
%2403 = torch.aten.slice.Tensor %2400, %int0_2285, %int0_2286, %2402, %int1_2287 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2403, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2288 = torch.constant.int 1
%int0_2289 = torch.constant.int 0
%int9223372036854775807_2290 = torch.constant.int 9223372036854775807
%int1_2291 = torch.constant.int 1
%2404 = torch.aten.slice.Tensor %2403, %int1_2288, %int0_2289, %int9223372036854775807_2290, %int1_2291 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2404, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2292 = torch.constant.int 1
%int0_2293 = torch.constant.int 0
%int9223372036854775807_2294 = torch.constant.int 9223372036854775807
%int1_2295 = torch.constant.int 1
%2405 = torch.aten.slice.Tensor %2404, %int1_2292, %int0_2293, %int9223372036854775807_2294, %int1_2295 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2405, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2296 = torch.constant.int 0
%2406 = torch.aten.unsqueeze %2405, %int0_2296 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2406, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2297 = torch.constant.int 1
%int0_2298 = torch.constant.int 0
%int9223372036854775807_2299 = torch.constant.int 9223372036854775807
%int1_2300 = torch.constant.int 1
%2407 = torch.aten.slice.Tensor %2406, %int1_2297, %int0_2298, %int9223372036854775807_2299, %int1_2300 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2407, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2301 = torch.constant.int 2
%int0_2302 = torch.constant.int 0
%int9223372036854775807_2303 = torch.constant.int 9223372036854775807
%int1_2304 = torch.constant.int 1
%2408 = torch.aten.slice.Tensor %2407, %int2_2301, %int0_2302, %int9223372036854775807_2303, %int1_2304 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2408, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2305 = torch.constant.int 1
%int1_2306 = torch.constant.int 1
%int1_2307 = torch.constant.int 1
%2409 = torch.prim.ListConstruct %int1_2305, %int1_2306, %int1_2307 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2410 = torch.aten.repeat %2408, %2409 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2410, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_2308 = torch.constant.int 6
%2411 = torch.prims.convert_element_type %2358, %int6_2308 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2411, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2412 = torch_c.to_builtin_tensor %2411 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2413 = torch_c.to_builtin_tensor %2410 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2414 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2412, %2413) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2415 = torch_c.from_builtin_tensor %2414 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2415, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2309 = torch.constant.int 15
%2416 = torch.prims.convert_element_type %2415, %int15_2309 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2416, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
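// Quantize K and V for the KV cache: divide by the cache quantizer scale (%144, likely
// kv_cache.quantizer:rscale), clamp to +/-240 (the f8E4M3FNUZ finite range), narrow to f8E4M3FNUZ.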
%2417 = torch.aten.div.Tensor %2416, %144 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2417, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2310 = torch.constant.float -2.400000e+02
%float2.400000e02_2311 = torch.constant.float 2.400000e+02
%2418 = torch.aten.clamp %2417, %float-2.400000e02_2310, %float2.400000e02_2311 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2418, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2312 = torch.constant.int 26
%2419 = torch.prims.convert_element_type %2418, %int26_2312 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2419, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2420 = torch.aten.div.Tensor %2360, %144 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2420, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2313 = torch.constant.float -2.400000e+02
%float2.400000e02_2314 = torch.constant.float 2.400000e+02
%2421 = torch.aten.clamp %2420, %float-2.400000e02_2313, %float2.400000e02_2314 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2421, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2315 = torch.constant.int 26
%2422 = torch.prims.convert_element_type %2421, %int26_2315 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2422, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
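// Compute flat cache-slot indices from the page table %arg2 (stride 64 = 32 transformer blocks x 2
// K/V entries; the +16 offset likely selects this block's K partition), then scatter K page-by-page
// into the paged cache (%2239) via view + index_put.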
%int64_2316 = torch.constant.int 64
%2423 = torch.aten.mul.Scalar %arg2, %int64_2316 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2423, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int16 = torch.constant.int 16
%int1_2317 = torch.constant.int 1
%2424 = torch.aten.add.Scalar %2423, %int16, %int1_2317 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2424, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2318 = torch.constant.int 1
%int32_2319 = torch.constant.int 32
%int8_2320 = torch.constant.int 8
%int128_2321 = torch.constant.int 128
%2425 = torch.prim.ListConstruct %int1_2318, %670, %int32_2319, %int8_2320, %int128_2321 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2426 = torch.aten.view %2419, %2425 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2426, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2322 = torch.constant.int 32
%int8_2323 = torch.constant.int 8
%int128_2324 = torch.constant.int 128
%2427 = torch.prim.ListConstruct %670, %int32_2322, %int8_2323, %int128_2324 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2428 = torch.aten.view %2426, %2427 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2428, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2429 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2430 = torch.aten.view %2424, %2429 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2430, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2325 = torch.constant.int 32
%int2_2326 = torch.constant.int 2
%int32_2327 = torch.constant.int 32
%int8_2328 = torch.constant.int 8
%int128_2329 = torch.constant.int 128
%2431 = torch.prim.ListConstruct %661, %int32_2325, %int2_2326, %int32_2327, %int8_2328, %int128_2329 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2432 = torch.aten.view %2239, %2431 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2432, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2330 = torch.constant.int 32
%2433 = torch.aten.mul.int %661, %int32_2330 : !torch.int, !torch.int -> !torch.int
%int2_2331 = torch.constant.int 2
%2434 = torch.aten.mul.int %2433, %int2_2331 : !torch.int, !torch.int -> !torch.int
%int32_2332 = torch.constant.int 32
%int8_2333 = torch.constant.int 8
%int128_2334 = torch.constant.int 128
%2435 = torch.prim.ListConstruct %2434, %int32_2332, %int8_2333, %int128_2334 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2436 = torch.aten.view %2432, %2435 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2436, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2437 = torch.prim.ListConstruct %2430 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2335 = torch.constant.bool false
%2438 = torch.aten.index_put %2436, %2437, %2428, %false_2335 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2438, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2336 = torch.constant.int 32
%int2_2337 = torch.constant.int 2
%int32_2338 = torch.constant.int 32
%int8_2339 = torch.constant.int 8
%int128_2340 = torch.constant.int 128
%2439 = torch.prim.ListConstruct %661, %int32_2336, %int2_2337, %int32_2338, %int8_2339, %int128_2340 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2440 = torch.aten.view %2438, %2439 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2440, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2341 = torch.constant.int 2097152
%2441 = torch.prim.ListConstruct %661, %int2097152_2341 : (!torch.int, !torch.int) -> !torch.list<int>
%2442 = torch.aten.view %2440, %2441 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2442, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
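// Re-expand the flattened cache and repeat the scatter for V at the adjacent slot (index + 1),
// then flatten back to the [?, 2097152] paged-cache layout.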
%int32_2342 = torch.constant.int 32
%int2_2343 = torch.constant.int 2
%int32_2344 = torch.constant.int 32
%int8_2345 = torch.constant.int 8
%int128_2346 = torch.constant.int 128
%2443 = torch.prim.ListConstruct %661, %int32_2342, %int2_2343, %int32_2344, %int8_2345, %int128_2346 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2444 = torch.aten.view %2442, %2443 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2444, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2347 = torch.constant.int 32
%int8_2348 = torch.constant.int 8
%int128_2349 = torch.constant.int 128
%2445 = torch.prim.ListConstruct %2434, %int32_2347, %int8_2348, %int128_2349 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2446 = torch.aten.view %2444, %2445 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2446, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_2350 = torch.constant.int 1
%int32_2351 = torch.constant.int 32
%int8_2352 = torch.constant.int 8
%int128_2353 = torch.constant.int 128
%2447 = torch.prim.ListConstruct %int1_2350, %670, %int32_2351, %int8_2352, %int128_2353 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2448 = torch.aten.view %2422, %2447 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2448, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2354 = torch.constant.int 32
%int8_2355 = torch.constant.int 8
%int128_2356 = torch.constant.int 128
%2449 = torch.prim.ListConstruct %670, %int32_2354, %int8_2355, %int128_2356 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2450 = torch.aten.view %2448, %2449 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2450, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2357 = torch.constant.int 1
%int1_2358 = torch.constant.int 1
%2451 = torch.aten.add.Scalar %2424, %int1_2357, %int1_2358 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2451, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2452 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2453 = torch.aten.view %2451, %2452 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2453, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2454 = torch.prim.ListConstruct %2453 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2359 = torch.constant.bool false
%2455 = torch.aten.index_put %2446, %2454, %2450, %false_2359 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2455, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2360 = torch.constant.int 32
%int2_2361 = torch.constant.int 2
%int32_2362 = torch.constant.int 32
%int8_2363 = torch.constant.int 8
%int128_2364 = torch.constant.int 128
%2456 = torch.prim.ListConstruct %661, %int32_2360, %int2_2361, %int32_2362, %int8_2363, %int128_2364 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2457 = torch.aten.view %2455, %2456 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2457, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2365 = torch.constant.int 2097152
%2458 = torch.prim.ListConstruct %661, %int2097152_2365 : (!torch.int, !torch.int) -> !torch.list<int>
%2459 = torch.aten.view %2457, %2458 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2459, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
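// Grouped-query attention: broadcast each of the 8 KV heads across 4 query heads
// (unsqueeze + expand + clone), so K and V become [1, seq, 32, 128].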
%int-2_2366 = torch.constant.int -2
%2460 = torch.aten.unsqueeze %2419, %int-2_2366 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2460, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2367 = torch.constant.int 1
%int8_2368 = torch.constant.int 8
%int4_2369 = torch.constant.int 4
%int128_2370 = torch.constant.int 128
%2461 = torch.prim.ListConstruct %int1_2367, %2401, %int8_2368, %int4_2369, %int128_2370 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2371 = torch.constant.bool false
%2462 = torch.aten.expand %2460, %2461, %false_2371 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2462, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2372 = torch.constant.int 0
%2463 = torch.aten.clone %2462, %int0_2372 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2463, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2373 = torch.constant.int 1
%int32_2374 = torch.constant.int 32
%int128_2375 = torch.constant.int 128
%2464 = torch.prim.ListConstruct %int1_2373, %2401, %int32_2374, %int128_2375 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2465 = torch.aten._unsafe_view %2463, %2464 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2465, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2376 = torch.constant.int -2
%2466 = torch.aten.unsqueeze %2422, %int-2_2376 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2466, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2377 = torch.constant.int 1
%2467 = torch.aten.size.int %2353, %int1_2377 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2378 = torch.constant.int 1
%int8_2379 = torch.constant.int 8
%int4_2380 = torch.constant.int 4
%int128_2381 = torch.constant.int 128
%2468 = torch.prim.ListConstruct %int1_2378, %2467, %int8_2379, %int4_2380, %int128_2381 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2382 = torch.constant.bool false
%2469 = torch.aten.expand %2466, %2468, %false_2382 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2469, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2383 = torch.constant.int 0
%2470 = torch.aten.clone %2469, %int0_2383 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2470, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2384 = torch.constant.int 1
%int32_2385 = torch.constant.int 32
%int128_2386 = torch.constant.int 128
%2471 = torch.prim.ListConstruct %int1_2384, %2467, %int32_2385, %int128_2386 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2472 = torch.aten._unsafe_view %2470, %2471 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2472, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
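// Dequantize the expanded K and V: widen to f32, multiply by the cache scale (%144), cast to bf16.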
%int6_2387 = torch.constant.int 6
%2473 = torch.prims.convert_element_type %2465, %int6_2387 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2473, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2474 = torch.aten.mul.Tensor %2473, %144 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2474, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2388 = torch.constant.int 15
%2475 = torch.prims.convert_element_type %2474, %int15_2388 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2475, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2389 = torch.constant.int 6
%2476 = torch.prims.convert_element_type %2472, %int6_2389 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2476, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2477 = torch.aten.mul.Tensor %2476, %144 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2477, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2390 = torch.constant.int 15
%2478 = torch.prims.convert_element_type %2477, %int15_2390 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2478, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
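// Transpose Q/K/V to [1, 32, seq, 128] and run causal flash attention
// (dropout 0.0, is_causal = true) in bf16.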
%int1_2391 = torch.constant.int 1
%int2_2392 = torch.constant.int 2
%2479 = torch.aten.transpose.int %2388, %int1_2391, %int2_2392 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2479, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2393 = torch.constant.int 1
%int2_2394 = torch.constant.int 2
%2480 = torch.aten.transpose.int %2475, %int1_2393, %int2_2394 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2480, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2395 = torch.constant.int 1
%int2_2396 = torch.constant.int 2
%2481 = torch.aten.transpose.int %2478, %int1_2395, %int2_2396 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2481, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2397 = torch.constant.float 0.000000e+00
%true_2398 = torch.constant.bool true
%none_2399 = torch.constant.none
%none_2400 = torch.constant.none
%2482:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2479, %2480, %2481, %float0.000000e00_2397, %true_2398, %none_2399, %none_2400) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2482#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
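// Transpose the attention output back, flatten to [1, seq, 4096], re-quantize to f8E4M3FNUZ
// (scale %145), apply the output projection (%146, [4096,4096]), and add the residual (%2317).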
%int1_2401 = torch.constant.int 1
%int2_2402 = torch.constant.int 2
%2483 = torch.aten.transpose.int %2482#0, %int1_2401, %int2_2402 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2483, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2403 = torch.constant.int 1
%int4096_2404 = torch.constant.int 4096
%2484 = torch.prim.ListConstruct %int1_2403, %2373, %int4096_2404 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2485 = torch.aten.view %2483, %2484 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2485, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2486 = torch.aten.div.Tensor %2485, %145 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2486, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2405 = torch.constant.float -2.400000e+02
%float2.400000e02_2406 = torch.constant.float 2.400000e+02
%2487 = torch.aten.clamp %2486, %float-2.400000e02_2405, %float2.400000e02_2406 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2487, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2407 = torch.constant.int 26
%2488 = torch.prims.convert_element_type %2487, %int26_2407 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2488, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2408 = torch.constant.int -2
%int-1_2409 = torch.constant.int -1
%2489 = torch.aten.transpose.int %146, %int-2_2408, %int-1_2409 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2410 = torch.constant.int 4096
%2490 = torch.prim.ListConstruct %2373, %int4096_2410 : (!torch.int, !torch.int) -> !torch.list<int>
%2491 = torch.aten.view %2488, %2490 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2491, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2492 = torch.aten.mm %2491, %2489 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2492, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2411 = torch.constant.int 1
%int4096_2412 = torch.constant.int 4096
%2493 = torch.prim.ListConstruct %int1_2411, %2373, %int4096_2412 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2494 = torch.aten.view %2492, %2493 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2494, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2413 = torch.constant.int 15
%2495 = torch.prims.convert_element_type %2494, %int15_2413 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2495, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2414 = torch.constant.int 1
%2496 = torch.aten.add.Tensor %2317, %2495, %int1_2414 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2496, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
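// RMSNorm over the updated residual: mean of squares, +1e-05 epsilon, rsqrt, scale by the norm
// weight (%147, likely ffn_norm), then quantize for the feed-forward matmuls.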
%int2_2415 = torch.constant.int 2
%2497 = torch.aten.pow.Tensor_Scalar %2496, %int2_2415 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2497, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2416 = torch.constant.int -1
%2498 = torch.prim.ListConstruct %int-1_2416 : (!torch.int) -> !torch.list<int>
%true_2417 = torch.constant.bool true
%none_2418 = torch.constant.none
%2499 = torch.aten.mean.dim %2497, %2498, %true_2417, %none_2418 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2499, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2419 = torch.constant.float 1.000000e-05
%int1_2420 = torch.constant.int 1
%2500 = torch.aten.add.Scalar %2499, %float1.000000e-05_2419, %int1_2420 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2500, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2501 = torch.aten.rsqrt %2500 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2501, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2502 = torch.aten.mul.Tensor %2496, %2501 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2502, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2503 = torch.aten.mul.Tensor %147, %2502 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2503, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2504 = torch.aten.div.Tensor %2503, %148 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2504, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2421 = torch.constant.float -2.400000e+02
%float2.400000e02_2422 = torch.constant.float 2.400000e+02
%2505 = torch.aten.clamp %2504, %float-2.400000e02_2421, %float2.400000e02_2422 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2505, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2423 = torch.constant.int 26
%2506 = torch.prims.convert_element_type %2505, %int26_2423 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2506, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
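// SwiGLU feed-forward, gate branch: project [4096 -> 14336] with %149 and apply SiLU.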
%int-2_2424 = torch.constant.int -2
%int-1_2425 = torch.constant.int -1
%2507 = torch.aten.transpose.int %149, %int-2_2424, %int-1_2425 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2426 = torch.constant.int 4096
%2508 = torch.prim.ListConstruct %566, %int4096_2426 : (!torch.int, !torch.int) -> !torch.list<int>
%2509 = torch.aten.view %2506, %2508 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2509, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2510 = torch.aten.mm %2509, %2507 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2510, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2427 = torch.constant.int 1
%int14336_2428 = torch.constant.int 14336
%2511 = torch.prim.ListConstruct %int1_2427, %566, %int14336_2428 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2512 = torch.aten.view %2510, %2511 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2512, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2429 = torch.constant.int 15
%2513 = torch.prims.convert_element_type %2512, %int15_2429 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2513, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2514 = torch.aten.silu %2513 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2514, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
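// Up branch: project [4096 -> 14336] with %151, then multiply elementwise with the SiLU-gated branch.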
%2515 = torch.aten.div.Tensor %2503, %150 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2515, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2430 = torch.constant.float -2.400000e+02
%float2.400000e02_2431 = torch.constant.float 2.400000e+02
%2516 = torch.aten.clamp %2515, %float-2.400000e02_2430, %float2.400000e02_2431 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2516, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2432 = torch.constant.int 26
%2517 = torch.prims.convert_element_type %2516, %int26_2432 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2517, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2433 = torch.constant.int -2
%int-1_2434 = torch.constant.int -1
%2518 = torch.aten.transpose.int %151, %int-2_2433, %int-1_2434 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2435 = torch.constant.int 4096
%2519 = torch.prim.ListConstruct %566, %int4096_2435 : (!torch.int, !torch.int) -> !torch.list<int>
%2520 = torch.aten.view %2517, %2519 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2520, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2521 = torch.aten.mm %2520, %2518 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2521, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2436 = torch.constant.int 1
%int14336_2437 = torch.constant.int 14336
%2522 = torch.prim.ListConstruct %int1_2436, %566, %int14336_2437 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2523 = torch.aten.view %2521, %2522 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2523, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2438 = torch.constant.int 15
%2524 = torch.prims.convert_element_type %2523, %int15_2438 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2524, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2525 = torch.aten.mul.Tensor %2514, %2524 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2525, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2526 = torch.aten.div.Tensor %2525, %152 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2526, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2439 = torch.constant.float -2.400000e+02
%float2.400000e02_2440 = torch.constant.float 2.400000e+02
%2527 = torch.aten.clamp %2526, %float-2.400000e02_2439, %float2.400000e02_2440 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2527, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2441 = torch.constant.int 26
%2528 = torch.prims.convert_element_type %2527, %int26_2441 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2528, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
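// Down projection [14336 -> 4096] with %153, cast to bf16, and the second residual add,
// completing this transformer block.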
%int-2_2442 = torch.constant.int -2
%int-1_2443 = torch.constant.int -1
%2529 = torch.aten.transpose.int %153, %int-2_2442, %int-1_2443 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2444 = torch.constant.int 1
%2530 = torch.aten.size.int %2512, %int1_2444 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2445 = torch.constant.int 14336
%2531 = torch.prim.ListConstruct %2530, %int14336_2445 : (!torch.int, !torch.int) -> !torch.list<int>
%2532 = torch.aten.view %2528, %2531 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2532, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2533 = torch.aten.mm %2532, %2529 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2533, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2446 = torch.constant.int 1
%int4096_2447 = torch.constant.int 4096
%2534 = torch.prim.ListConstruct %int1_2446, %2530, %int4096_2447 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2535 = torch.aten.view %2533, %2534 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2535, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2448 = torch.constant.int 15
%2536 = torch.prims.convert_element_type %2535, %int15_2448 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2536, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2449 = torch.constant.int 1
%2537 = torch.aten.add.Tensor %2496, %2536, %int1_2449 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2537, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
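// Next transformer block: attention RMSNorm (weight %154) on the residual, then quantized
// Q (%156, [4096,4096]), K (%158, [1024,4096]) and V (%160, [1024,4096]) projections.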
%int2_2450 = torch.constant.int 2
%2538 = torch.aten.pow.Tensor_Scalar %2537, %int2_2450 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2538, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2451 = torch.constant.int -1
%2539 = torch.prim.ListConstruct %int-1_2451 : (!torch.int) -> !torch.list<int>
%true_2452 = torch.constant.bool true
%none_2453 = torch.constant.none
%2540 = torch.aten.mean.dim %2538, %2539, %true_2452, %none_2453 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2540, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2454 = torch.constant.float 1.000000e-05
%int1_2455 = torch.constant.int 1
%2541 = torch.aten.add.Scalar %2540, %float1.000000e-05_2454, %int1_2455 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2541, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2542 = torch.aten.rsqrt %2541 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2542, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2543 = torch.aten.mul.Tensor %2537, %2542 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2543, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2544 = torch.aten.mul.Tensor %154, %2543 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2544, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2545 = torch.aten.div.Tensor %2544, %155 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2545, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2456 = torch.constant.float -2.400000e+02
%float2.400000e02_2457 = torch.constant.float 2.400000e+02
%2546 = torch.aten.clamp %2545, %float-2.400000e02_2456, %float2.400000e02_2457 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2546, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2458 = torch.constant.int 26
%2547 = torch.prims.convert_element_type %2546, %int26_2458 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2547, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2459 = torch.constant.int -2
%int-1_2460 = torch.constant.int -1
%2548 = torch.aten.transpose.int %156, %int-2_2459, %int-1_2460 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2461 = torch.constant.int 4096
%2549 = torch.prim.ListConstruct %566, %int4096_2461 : (!torch.int, !torch.int) -> !torch.list<int>
%2550 = torch.aten.view %2547, %2549 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2550, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2551 = torch.aten.mm %2550, %2548 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2551, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2462 = torch.constant.int 1
%int4096_2463 = torch.constant.int 4096
%2552 = torch.prim.ListConstruct %int1_2462, %566, %int4096_2463 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2553 = torch.aten.view %2551, %2552 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2553, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2464 = torch.constant.int 15
%2554 = torch.prims.convert_element_type %2553, %int15_2464 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2554, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2555 = torch.aten.div.Tensor %2544, %157 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2555, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2465 = torch.constant.float -2.400000e+02
%float2.400000e02_2466 = torch.constant.float 2.400000e+02
%2556 = torch.aten.clamp %2555, %float-2.400000e02_2465, %float2.400000e02_2466 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2556, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2467 = torch.constant.int 26
%2557 = torch.prims.convert_element_type %2556, %int26_2467 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2557, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2468 = torch.constant.int -2
%int-1_2469 = torch.constant.int -1
%2558 = torch.aten.transpose.int %158, %int-2_2468, %int-1_2469 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2470 = torch.constant.int 4096
%2559 = torch.prim.ListConstruct %566, %int4096_2470 : (!torch.int, !torch.int) -> !torch.list<int>
%2560 = torch.aten.view %2557, %2559 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2560, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2561 = torch.aten.mm %2560, %2558 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2561, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2471 = torch.constant.int 1
%int1024_2472 = torch.constant.int 1024
%2562 = torch.prim.ListConstruct %int1_2471, %566, %int1024_2472 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2563 = torch.aten.view %2561, %2562 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2563, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2473 = torch.constant.int 15
%2564 = torch.prims.convert_element_type %2563, %int15_2473 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2564, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2565 = torch.aten.div.Tensor %2544, %159 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2565, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2474 = torch.constant.float -2.400000e+02
%float2.400000e02_2475 = torch.constant.float 2.400000e+02
%2566 = torch.aten.clamp %2565, %float-2.400000e02_2474, %float2.400000e02_2475 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2566, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2476 = torch.constant.int 26
%2567 = torch.prims.convert_element_type %2566, %int26_2476 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2567, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2477 = torch.constant.int -2
%int-1_2478 = torch.constant.int -1
%2568 = torch.aten.transpose.int %160, %int-2_2477, %int-1_2478 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2479 = torch.constant.int 4096
%2569 = torch.prim.ListConstruct %566, %int4096_2479 : (!torch.int, !torch.int) -> !torch.list<int>
%2570 = torch.aten.view %2567, %2569 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2570, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2571 = torch.aten.mm %2570, %2568 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2571, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2480 = torch.constant.int 1
%int1024_2481 = torch.constant.int 1024
%2572 = torch.prim.ListConstruct %int1_2480, %566, %int1024_2481 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2573 = torch.aten.view %2571, %2572 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2573, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2482 = torch.constant.int 15
%2574 = torch.prims.convert_element_type %2573, %int15_2482 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2574, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
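// Reshape the new Q/K/V into per-head layout as above; the rotary angle-table construction
// then repeats for this block.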
%int1_2483 = torch.constant.int 1
%int32_2484 = torch.constant.int 32
%int128_2485 = torch.constant.int 128
%2575 = torch.prim.ListConstruct %int1_2483, %566, %int32_2484, %int128_2485 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2576 = torch.aten.view %2554, %2575 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2576, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2486 = torch.constant.int 1
%int8_2487 = torch.constant.int 8
%int128_2488 = torch.constant.int 128
%2577 = torch.prim.ListConstruct %int1_2486, %566, %int8_2487, %int128_2488 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2578 = torch.aten.view %2564, %2577 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2578, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_2489 = torch.constant.int 1
%int8_2490 = torch.constant.int 8
%int128_2491 = torch.constant.int 128
%2579 = torch.prim.ListConstruct %int1_2489, %566, %int8_2490, %int128_2491 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2580 = torch.aten.view %2574, %2579 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2580, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
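    // Rotary embedding angle table for Q: inv_freq[i] = 1 / 500000^(2*(i/2)/128) over head_dim 128,
    // multiplied by positions 0..131071 to give a [131072,128] table.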
%int131072_2492 = torch.constant.int 131072
%none_2493 = torch.constant.none
%none_2494 = torch.constant.none
%cpu_2495 = torch.constant.device "cpu"
%false_2496 = torch.constant.bool false
%2581 = torch.aten.arange %int131072_2492, %none_2493, %none_2494, %cpu_2495, %false_2496 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2497 = torch.constant.int 0
%int128_2498 = torch.constant.int 128
%none_2499 = torch.constant.none
%none_2500 = torch.constant.none
%cpu_2501 = torch.constant.device "cpu"
%false_2502 = torch.constant.bool false
%2582 = torch.aten.arange.start %int0_2497, %int128_2498, %none_2499, %none_2500, %cpu_2501, %false_2502 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2503 = torch.constant.int 2
%2583 = torch.aten.floor_divide.Scalar %2582, %int2_2503 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2504 = torch.constant.int 6
%2584 = torch.prims.convert_element_type %2583, %int6_2504 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2505 = torch.constant.int 128
%2585 = torch.aten.div.Scalar %2584, %int128_2505 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2506 = torch.constant.float 2.000000e+00
%2586 = torch.aten.mul.Scalar %2585, %float2.000000e00_2506 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2507 = torch.constant.float 5.000000e+05
%2587 = torch.aten.pow.Scalar %float5.000000e05_2507, %2586 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2588 = torch.aten.reciprocal %2587 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2508 = torch.constant.float 1.000000e+00
%2589 = torch.aten.mul.Scalar %2588, %float1.000000e00_2508 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2509 = torch.constant.int 131072
%int1_2510 = torch.constant.int 1
%2590 = torch.prim.ListConstruct %int131072_2509, %int1_2510 : (!torch.int, !torch.int) -> !torch.list<int>
%2591 = torch.aten.view %2581, %2590 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2592 = torch.aten.mul.Tensor %2591, %2589 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
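    // Slice the angle table down to the current sequence length and broadcast it to [1,?,128]
    // for the rotary kernel.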
%int1_2511 = torch.constant.int 1
%2593 = torch.aten.size.int %2553, %int1_2511 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2512 = torch.constant.int 0
%2594 = torch.aten.add.int %int0_2512, %2593 : !torch.int, !torch.int -> !torch.int
%int0_2513 = torch.constant.int 0
%int0_2514 = torch.constant.int 0
%int1_2515 = torch.constant.int 1
%2595 = torch.aten.slice.Tensor %2592, %int0_2513, %int0_2514, %2594, %int1_2515 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2595, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2516 = torch.constant.int 1
%int0_2517 = torch.constant.int 0
%int9223372036854775807_2518 = torch.constant.int 9223372036854775807
%int1_2519 = torch.constant.int 1
%2596 = torch.aten.slice.Tensor %2595, %int1_2516, %int0_2517, %int9223372036854775807_2518, %int1_2519 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2596, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2520 = torch.constant.int 1
%int0_2521 = torch.constant.int 0
%int9223372036854775807_2522 = torch.constant.int 9223372036854775807
%int1_2523 = torch.constant.int 1
%2597 = torch.aten.slice.Tensor %2596, %int1_2520, %int0_2521, %int9223372036854775807_2522, %int1_2523 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2597, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2524 = torch.constant.int 0
%2598 = torch.aten.unsqueeze %2597, %int0_2524 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2598, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2525 = torch.constant.int 1
%int0_2526 = torch.constant.int 0
%int9223372036854775807_2527 = torch.constant.int 9223372036854775807
%int1_2528 = torch.constant.int 1
%2599 = torch.aten.slice.Tensor %2598, %int1_2525, %int0_2526, %int9223372036854775807_2527, %int1_2528 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2599, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2529 = torch.constant.int 2
%int0_2530 = torch.constant.int 0
%int9223372036854775807_2531 = torch.constant.int 9223372036854775807
%int1_2532 = torch.constant.int 1
%2600 = torch.aten.slice.Tensor %2599, %int2_2529, %int0_2530, %int9223372036854775807_2531, %int1_2532 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2600, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2533 = torch.constant.int 1
%int1_2534 = torch.constant.int 1
%int1_2535 = torch.constant.int 1
%2601 = torch.prim.ListConstruct %int1_2533, %int1_2534, %int1_2535 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2602 = torch.aten.repeat %2600, %2601 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2602, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
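    // Apply rotary position embedding to the query heads in f32 via the
    // sharktank_rotary_embedding kernel, then cast back to bf16.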
%int6_2536 = torch.constant.int 6
%2603 = torch.prims.convert_element_type %2576, %int6_2536 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2603, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2604 = torch_c.to_builtin_tensor %2603 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2605 = torch_c.to_builtin_tensor %2602 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2606 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2604, %2605) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2607 = torch_c.from_builtin_tensor %2606 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2607, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2537 = torch.constant.int 15
%2608 = torch.prims.convert_element_type %2607, %int15_2537 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2608, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
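    // Rebuild the identical angle table and apply the same rotary embedding to the
    // key heads (8 x 128) via the 8-head variant of the kernel.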
%int131072_2538 = torch.constant.int 131072
%none_2539 = torch.constant.none
%none_2540 = torch.constant.none
%cpu_2541 = torch.constant.device "cpu"
%false_2542 = torch.constant.bool false
%2609 = torch.aten.arange %int131072_2538, %none_2539, %none_2540, %cpu_2541, %false_2542 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2543 = torch.constant.int 0
%int128_2544 = torch.constant.int 128
%none_2545 = torch.constant.none
%none_2546 = torch.constant.none
%cpu_2547 = torch.constant.device "cpu"
%false_2548 = torch.constant.bool false
%2610 = torch.aten.arange.start %int0_2543, %int128_2544, %none_2545, %none_2546, %cpu_2547, %false_2548 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2549 = torch.constant.int 2
%2611 = torch.aten.floor_divide.Scalar %2610, %int2_2549 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2550 = torch.constant.int 6
%2612 = torch.prims.convert_element_type %2611, %int6_2550 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2551 = torch.constant.int 128
%2613 = torch.aten.div.Scalar %2612, %int128_2551 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2552 = torch.constant.float 2.000000e+00
%2614 = torch.aten.mul.Scalar %2613, %float2.000000e00_2552 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2553 = torch.constant.float 5.000000e+05
%2615 = torch.aten.pow.Scalar %float5.000000e05_2553, %2614 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2616 = torch.aten.reciprocal %2615 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2554 = torch.constant.float 1.000000e+00
%2617 = torch.aten.mul.Scalar %2616, %float1.000000e00_2554 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2555 = torch.constant.int 131072
%int1_2556 = torch.constant.int 1
%2618 = torch.prim.ListConstruct %int131072_2555, %int1_2556 : (!torch.int, !torch.int) -> !torch.list<int>
%2619 = torch.aten.view %2609, %2618 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2620 = torch.aten.mul.Tensor %2619, %2617 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2557 = torch.constant.int 1
%2621 = torch.aten.size.int %2563, %int1_2557 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2558 = torch.constant.int 0
%2622 = torch.aten.add.int %int0_2558, %2621 : !torch.int, !torch.int -> !torch.int
%int0_2559 = torch.constant.int 0
%int0_2560 = torch.constant.int 0
%int1_2561 = torch.constant.int 1
%2623 = torch.aten.slice.Tensor %2620, %int0_2559, %int0_2560, %2622, %int1_2561 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2623, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2562 = torch.constant.int 1
%int0_2563 = torch.constant.int 0
%int9223372036854775807_2564 = torch.constant.int 9223372036854775807
%int1_2565 = torch.constant.int 1
%2624 = torch.aten.slice.Tensor %2623, %int1_2562, %int0_2563, %int9223372036854775807_2564, %int1_2565 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2624, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2566 = torch.constant.int 1
%int0_2567 = torch.constant.int 0
%int9223372036854775807_2568 = torch.constant.int 9223372036854775807
%int1_2569 = torch.constant.int 1
%2625 = torch.aten.slice.Tensor %2624, %int1_2566, %int0_2567, %int9223372036854775807_2568, %int1_2569 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2625, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2570 = torch.constant.int 0
%2626 = torch.aten.unsqueeze %2625, %int0_2570 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2626, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2571 = torch.constant.int 1
%int0_2572 = torch.constant.int 0
%int9223372036854775807_2573 = torch.constant.int 9223372036854775807
%int1_2574 = torch.constant.int 1
%2627 = torch.aten.slice.Tensor %2626, %int1_2571, %int0_2572, %int9223372036854775807_2573, %int1_2574 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2627, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2575 = torch.constant.int 2
%int0_2576 = torch.constant.int 0
%int9223372036854775807_2577 = torch.constant.int 9223372036854775807
%int1_2578 = torch.constant.int 1
%2628 = torch.aten.slice.Tensor %2627, %int2_2575, %int0_2576, %int9223372036854775807_2577, %int1_2578 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2628, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2579 = torch.constant.int 1
%int1_2580 = torch.constant.int 1
%int1_2581 = torch.constant.int 1
%2629 = torch.prim.ListConstruct %int1_2579, %int1_2580, %int1_2581 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2630 = torch.aten.repeat %2628, %2629 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2630, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_2582 = torch.constant.int 6
%2631 = torch.prims.convert_element_type %2578, %int6_2582 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2631, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2632 = torch_c.to_builtin_tensor %2631 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2633 = torch_c.to_builtin_tensor %2630 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2634 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2632, %2633) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2635 = torch_c.from_builtin_tensor %2634 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2635, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2583 = torch.constant.int 15
%2636 = torch.prims.convert_element_type %2635, %int15_2583 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2636, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
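    // Quantize the rotated keys and the values to f8E4M3FNUZ for the KV cache:
    // divide by what appears to be the cache quantizer scale, clamp to [-240, 240], cast.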
%2637 = torch.aten.div.Tensor %2636, %161 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2637, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2584 = torch.constant.float -2.400000e+02
%float2.400000e02_2585 = torch.constant.float 2.400000e+02
%2638 = torch.aten.clamp %2637, %float-2.400000e02_2584, %float2.400000e02_2585 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2638, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2586 = torch.constant.int 26
%2639 = torch.prims.convert_element_type %2638, %int26_2586 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2639, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2640 = torch.aten.div.Tensor %2580, %161 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2640, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2587 = torch.constant.float -2.400000e+02
%float2.400000e02_2588 = torch.constant.float 2.400000e+02
%2641 = torch.aten.clamp %2640, %float-2.400000e02_2587, %float2.400000e02_2588 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2641, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2589 = torch.constant.int 26
%2642 = torch.prims.convert_element_type %2641, %int26_2589 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2642, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
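    // Scatter the quantized keys into the paged KV cache: compute flat slot indices
    // (page id * 64 + offset, here 18, which appears to select this block's key sub-slot),
    // view the flat [?,2097152] f16 buffer as [pages,32,2,32,8,128], index_put the new keys,
    // and flatten the buffer back.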
%int64_2590 = torch.constant.int 64
%2643 = torch.aten.mul.Scalar %arg2, %int64_2590 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2643, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int18 = torch.constant.int 18
%int1_2591 = torch.constant.int 1
%2644 = torch.aten.add.Scalar %2643, %int18, %int1_2591 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2644, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2592 = torch.constant.int 1
%int32_2593 = torch.constant.int 32
%int8_2594 = torch.constant.int 8
%int128_2595 = torch.constant.int 128
%2645 = torch.prim.ListConstruct %int1_2592, %670, %int32_2593, %int8_2594, %int128_2595 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2646 = torch.aten.view %2639, %2645 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2646, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2596 = torch.constant.int 32
%int8_2597 = torch.constant.int 8
%int128_2598 = torch.constant.int 128
%2647 = torch.prim.ListConstruct %670, %int32_2596, %int8_2597, %int128_2598 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2648 = torch.aten.view %2646, %2647 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2648, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2649 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2650 = torch.aten.view %2644, %2649 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2650, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2599 = torch.constant.int 32
%int2_2600 = torch.constant.int 2
%int32_2601 = torch.constant.int 32
%int8_2602 = torch.constant.int 8
%int128_2603 = torch.constant.int 128
%2651 = torch.prim.ListConstruct %661, %int32_2599, %int2_2600, %int32_2601, %int8_2602, %int128_2603 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2652 = torch.aten.view %2459, %2651 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2652, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2604 = torch.constant.int 32
%2653 = torch.aten.mul.int %661, %int32_2604 : !torch.int, !torch.int -> !torch.int
%int2_2605 = torch.constant.int 2
%2654 = torch.aten.mul.int %2653, %int2_2605 : !torch.int, !torch.int -> !torch.int
%int32_2606 = torch.constant.int 32
%int8_2607 = torch.constant.int 8
%int128_2608 = torch.constant.int 128
%2655 = torch.prim.ListConstruct %2654, %int32_2606, %int8_2607, %int128_2608 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2656 = torch.aten.view %2652, %2655 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2656, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2657 = torch.prim.ListConstruct %2650 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2609 = torch.constant.bool false
%2658 = torch.aten.index_put %2656, %2657, %2648, %false_2609 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2658, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2610 = torch.constant.int 32
%int2_2611 = torch.constant.int 2
%int32_2612 = torch.constant.int 32
%int8_2613 = torch.constant.int 8
%int128_2614 = torch.constant.int 128
%2659 = torch.prim.ListConstruct %661, %int32_2610, %int2_2611, %int32_2612, %int8_2613, %int128_2614 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2660 = torch.aten.view %2658, %2659 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2660, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2615 = torch.constant.int 2097152
%2661 = torch.prim.ListConstruct %661, %int2097152_2615 : (!torch.int, !torch.int) -> !torch.list<int>
%2662 = torch.aten.view %2660, %2661 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2662, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_2616 = torch.constant.int 32
%int2_2617 = torch.constant.int 2
%int32_2618 = torch.constant.int 32
%int8_2619 = torch.constant.int 8
%int128_2620 = torch.constant.int 128
%2663 = torch.prim.ListConstruct %661, %int32_2616, %int2_2617, %int32_2618, %int8_2619, %int128_2620 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2664 = torch.aten.view %2662, %2663 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2664, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2621 = torch.constant.int 32
%int8_2622 = torch.constant.int 8
%int128_2623 = torch.constant.int 128
%2665 = torch.prim.ListConstruct %2654, %int32_2621, %int8_2622, %int128_2623 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2666 = torch.aten.view %2664, %2665 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2666, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
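    // Scatter the quantized values at the adjacent sub-slot (index + 1) of the same paged cache.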
%int1_2624 = torch.constant.int 1
%int32_2625 = torch.constant.int 32
%int8_2626 = torch.constant.int 8
%int128_2627 = torch.constant.int 128
%2667 = torch.prim.ListConstruct %int1_2624, %670, %int32_2625, %int8_2626, %int128_2627 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2668 = torch.aten.view %2642, %2667 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2668, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2628 = torch.constant.int 32
%int8_2629 = torch.constant.int 8
%int128_2630 = torch.constant.int 128
%2669 = torch.prim.ListConstruct %670, %int32_2628, %int8_2629, %int128_2630 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2670 = torch.aten.view %2668, %2669 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2670, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2631 = torch.constant.int 1
%int1_2632 = torch.constant.int 1
%2671 = torch.aten.add.Scalar %2644, %int1_2631, %int1_2632 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2671, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2672 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2673 = torch.aten.view %2671, %2672 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2673, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2674 = torch.prim.ListConstruct %2673 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2633 = torch.constant.bool false
%2675 = torch.aten.index_put %2666, %2674, %2670, %false_2633 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2675, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2634 = torch.constant.int 32
%int2_2635 = torch.constant.int 2
%int32_2636 = torch.constant.int 32
%int8_2637 = torch.constant.int 8
%int128_2638 = torch.constant.int 128
%2676 = torch.prim.ListConstruct %661, %int32_2634, %int2_2635, %int32_2636, %int8_2637, %int128_2638 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2677 = torch.aten.view %2675, %2676 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2677, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2639 = torch.constant.int 2097152
%2678 = torch.prim.ListConstruct %661, %int2097152_2639 : (!torch.int, !torch.int) -> !torch.list<int>
%2679 = torch.aten.view %2677, %2678 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2679, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
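    // Grouped-query expansion: unsqueeze and expand each of the 8 KV heads 4x,
    // flattening keys and values to [1,?,32,128] to match the 32 query heads.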
%int-2_2640 = torch.constant.int -2
%2680 = torch.aten.unsqueeze %2639, %int-2_2640 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2680, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2641 = torch.constant.int 1
%int8_2642 = torch.constant.int 8
%int4_2643 = torch.constant.int 4
%int128_2644 = torch.constant.int 128
%2681 = torch.prim.ListConstruct %int1_2641, %2621, %int8_2642, %int4_2643, %int128_2644 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2645 = torch.constant.bool false
%2682 = torch.aten.expand %2680, %2681, %false_2645 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2682, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2646 = torch.constant.int 0
%2683 = torch.aten.clone %2682, %int0_2646 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2683, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2647 = torch.constant.int 1
%int32_2648 = torch.constant.int 32
%int128_2649 = torch.constant.int 128
%2684 = torch.prim.ListConstruct %int1_2647, %2621, %int32_2648, %int128_2649 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2685 = torch.aten._unsafe_view %2683, %2684 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2685, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2650 = torch.constant.int -2
%2686 = torch.aten.unsqueeze %2642, %int-2_2650 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2686, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2651 = torch.constant.int 1
%2687 = torch.aten.size.int %2573, %int1_2651 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2652 = torch.constant.int 1
%int8_2653 = torch.constant.int 8
%int4_2654 = torch.constant.int 4
%int128_2655 = torch.constant.int 128
%2688 = torch.prim.ListConstruct %int1_2652, %2687, %int8_2653, %int4_2654, %int128_2655 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2656 = torch.constant.bool false
%2689 = torch.aten.expand %2686, %2688, %false_2656 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2689, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2657 = torch.constant.int 0
%2690 = torch.aten.clone %2689, %int0_2657 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2690, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2658 = torch.constant.int 1
%int32_2659 = torch.constant.int 32
%int128_2660 = torch.constant.int 128
%2691 = torch.prim.ListConstruct %int1_2658, %2687, %int32_2659, %int128_2660 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2692 = torch.aten._unsafe_view %2690, %2691 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2692, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
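    // Dequantize the expanded keys and values back to bf16 by multiplying with the same scale.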
%int6_2661 = torch.constant.int 6
%2693 = torch.prims.convert_element_type %2685, %int6_2661 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2693, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2694 = torch.aten.mul.Tensor %2693, %161 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2694, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2662 = torch.constant.int 15
%2695 = torch.prims.convert_element_type %2694, %int15_2662 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2695, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2663 = torch.constant.int 6
%2696 = torch.prims.convert_element_type %2692, %int6_2663 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2696, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2697 = torch.aten.mul.Tensor %2696, %161 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2697, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2664 = torch.constant.int 15
%2698 = torch.prims.convert_element_type %2697, %int15_2664 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2698, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
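    // Transpose Q, K, V to [1,32,?,128] and run causal scaled-dot-product attention
    // (dropout 0, is_causal = true).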
%int1_2665 = torch.constant.int 1
%int2_2666 = torch.constant.int 2
%2699 = torch.aten.transpose.int %2608, %int1_2665, %int2_2666 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2699, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2667 = torch.constant.int 1
%int2_2668 = torch.constant.int 2
%2700 = torch.aten.transpose.int %2695, %int1_2667, %int2_2668 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2700, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2669 = torch.constant.int 1
%int2_2670 = torch.constant.int 2
%2701 = torch.aten.transpose.int %2698, %int1_2669, %int2_2670 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2701, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2671 = torch.constant.float 0.000000e+00
%true_2672 = torch.constant.bool true
%none_2673 = torch.constant.none
%none_2674 = torch.constant.none
%2702:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2699, %2700, %2701, %float0.000000e00_2671, %true_2672, %none_2673, %none_2674) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2702#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
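    // Transpose the attention output back to [1,?,32,128], flatten to [1,?,4096], quantize,
    // apply the f8 [4096x4096] output projection, widen to bf16, and add the residual.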
%int1_2675 = torch.constant.int 1
%int2_2676 = torch.constant.int 2
%2703 = torch.aten.transpose.int %2702#0, %int1_2675, %int2_2676 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2703, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2677 = torch.constant.int 1
%int4096_2678 = torch.constant.int 4096
%2704 = torch.prim.ListConstruct %int1_2677, %2593, %int4096_2678 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2705 = torch.aten.view %2703, %2704 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2705, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2706 = torch.aten.div.Tensor %2705, %162 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2706, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2679 = torch.constant.float -2.400000e+02
%float2.400000e02_2680 = torch.constant.float 2.400000e+02
%2707 = torch.aten.clamp %2706, %float-2.400000e02_2679, %float2.400000e02_2680 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2707, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2681 = torch.constant.int 26
%2708 = torch.prims.convert_element_type %2707, %int26_2681 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2708, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2682 = torch.constant.int -2
%int-1_2683 = torch.constant.int -1
%2709 = torch.aten.transpose.int %163, %int-2_2682, %int-1_2683 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2684 = torch.constant.int 4096
%2710 = torch.prim.ListConstruct %2593, %int4096_2684 : (!torch.int, !torch.int) -> !torch.list<int>
%2711 = torch.aten.view %2708, %2710 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2711, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2712 = torch.aten.mm %2711, %2709 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2712, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2685 = torch.constant.int 1
%int4096_2686 = torch.constant.int 4096
%2713 = torch.prim.ListConstruct %int1_2685, %2593, %int4096_2686 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2714 = torch.aten.view %2712, %2713 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2714, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2687 = torch.constant.int 15
%2715 = torch.prims.convert_element_type %2714, %int15_2687 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2715, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2688 = torch.constant.int 1
%2716 = torch.aten.add.Tensor %2537, %2715, %int1_2688 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2716, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
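    // RMSNorm of the residual stream (mean of squares + 1e-5, rsqrt, scale by the norm weight,
    // likely ffn_norm), then quantize the result for the FFN.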
%int2_2689 = torch.constant.int 2
%2717 = torch.aten.pow.Tensor_Scalar %2716, %int2_2689 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2717, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2690 = torch.constant.int -1
%2718 = torch.prim.ListConstruct %int-1_2690 : (!torch.int) -> !torch.list<int>
%true_2691 = torch.constant.bool true
%none_2692 = torch.constant.none
%2719 = torch.aten.mean.dim %2717, %2718, %true_2691, %none_2692 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2719, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2693 = torch.constant.float 1.000000e-05
%int1_2694 = torch.constant.int 1
%2720 = torch.aten.add.Scalar %2719, %float1.000000e-05_2693, %int1_2694 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2720, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2721 = torch.aten.rsqrt %2720 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2721, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2722 = torch.aten.mul.Tensor %2716, %2721 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2722, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2723 = torch.aten.mul.Tensor %164, %2722 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2723, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2724 = torch.aten.div.Tensor %2723, %165 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2724, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2695 = torch.constant.float -2.400000e+02
%float2.400000e02_2696 = torch.constant.float 2.400000e+02
%2725 = torch.aten.clamp %2724, %float-2.400000e02_2695, %float2.400000e02_2696 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2725, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2697 = torch.constant.int 26
%2726 = torch.prims.convert_element_type %2725, %int26_2697 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2726, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
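    // FFN gate projection: fp8 matmul [?,4096] x [4096,14336], widen to bf16, apply SiLU.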
%int-2_2698 = torch.constant.int -2
%int-1_2699 = torch.constant.int -1
%2727 = torch.aten.transpose.int %166, %int-2_2698, %int-1_2699 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2700 = torch.constant.int 4096
%2728 = torch.prim.ListConstruct %566, %int4096_2700 : (!torch.int, !torch.int) -> !torch.list<int>
%2729 = torch.aten.view %2726, %2728 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2729, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2730 = torch.aten.mm %2729, %2727 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2730, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2701 = torch.constant.int 1
%int14336_2702 = torch.constant.int 14336
%2731 = torch.prim.ListConstruct %int1_2701, %566, %int14336_2702 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2732 = torch.aten.view %2730, %2731 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2732, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2703 = torch.constant.int 15
%2733 = torch.prims.convert_element_type %2732, %int15_2703 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2733, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2734 = torch.aten.silu %2733 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2734, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
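    // FFN up projection with its own input scale, then the SwiGLU product silu(gate) * up.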
%2735 = torch.aten.div.Tensor %2723, %167 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2735, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2704 = torch.constant.float -2.400000e+02
%float2.400000e02_2705 = torch.constant.float 2.400000e+02
%2736 = torch.aten.clamp %2735, %float-2.400000e02_2704, %float2.400000e02_2705 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2736, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2706 = torch.constant.int 26
%2737 = torch.prims.convert_element_type %2736, %int26_2706 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2737, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2707 = torch.constant.int -2
%int-1_2708 = torch.constant.int -1
%2738 = torch.aten.transpose.int %168, %int-2_2707, %int-1_2708 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2709 = torch.constant.int 4096
%2739 = torch.prim.ListConstruct %566, %int4096_2709 : (!torch.int, !torch.int) -> !torch.list<int>
%2740 = torch.aten.view %2737, %2739 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2740, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2741 = torch.aten.mm %2740, %2738 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2741, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2710 = torch.constant.int 1
%int14336_2711 = torch.constant.int 14336
%2742 = torch.prim.ListConstruct %int1_2710, %566, %int14336_2711 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2743 = torch.aten.view %2741, %2742 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2743, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2712 = torch.constant.int 15
%2744 = torch.prims.convert_element_type %2743, %int15_2712 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2744, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2745 = torch.aten.mul.Tensor %2734, %2744 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2745, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
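    // Quantize the product, apply the [14336 -> 4096] down projection, widen to bf16,
    // and add the residual.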
%2746 = torch.aten.div.Tensor %2745, %169 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2746, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2713 = torch.constant.float -2.400000e+02
%float2.400000e02_2714 = torch.constant.float 2.400000e+02
%2747 = torch.aten.clamp %2746, %float-2.400000e02_2713, %float2.400000e02_2714 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2747, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2715 = torch.constant.int 26
%2748 = torch.prims.convert_element_type %2747, %int26_2715 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2748, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_2716 = torch.constant.int -2
%int-1_2717 = torch.constant.int -1
%2749 = torch.aten.transpose.int %170, %int-2_2716, %int-1_2717 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2718 = torch.constant.int 1
%2750 = torch.aten.size.int %2732, %int1_2718 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2719 = torch.constant.int 14336
%2751 = torch.prim.ListConstruct %2750, %int14336_2719 : (!torch.int, !torch.int) -> !torch.list<int>
%2752 = torch.aten.view %2748, %2751 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2752, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2753 = torch.aten.mm %2752, %2749 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2753, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2720 = torch.constant.int 1
%int4096_2721 = torch.constant.int 4096
%2754 = torch.prim.ListConstruct %int1_2720, %2750, %int4096_2721 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2755 = torch.aten.view %2753, %2754 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2755, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2722 = torch.constant.int 15
%2756 = torch.prims.convert_element_type %2755, %int15_2722 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2756, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2723 = torch.constant.int 1
%2757 = torch.aten.add.Tensor %2716, %2756, %int1_2723 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2757, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
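    // Next transformer block: RMSNorm of the residual (attn_norm), then the f8 query (4096->4096),
    // key (4096->1024), and value (4096->1024) projections.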
%int2_2724 = torch.constant.int 2
%2758 = torch.aten.pow.Tensor_Scalar %2757, %int2_2724 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2758, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2725 = torch.constant.int -1
%2759 = torch.prim.ListConstruct %int-1_2725 : (!torch.int) -> !torch.list<int>
%true_2726 = torch.constant.bool true
%none_2727 = torch.constant.none
%2760 = torch.aten.mean.dim %2758, %2759, %true_2726, %none_2727 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2760, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2728 = torch.constant.float 1.000000e-05
%int1_2729 = torch.constant.int 1
%2761 = torch.aten.add.Scalar %2760, %float1.000000e-05_2728, %int1_2729 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2761, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2762 = torch.aten.rsqrt %2761 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2762, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2763 = torch.aten.mul.Tensor %2757, %2762 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2763, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2764 = torch.aten.mul.Tensor %171, %2763 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2764, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2765 = torch.aten.div.Tensor %2764, %172 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2765, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2730 = torch.constant.float -2.400000e+02
%float2.400000e02_2731 = torch.constant.float 2.400000e+02
%2766 = torch.aten.clamp %2765, %float-2.400000e02_2730, %float2.400000e02_2731 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2766, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2732 = torch.constant.int 26
%2767 = torch.prims.convert_element_type %2766, %int26_2732 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2767, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2733 = torch.constant.int -2
%int-1_2734 = torch.constant.int -1
%2768 = torch.aten.transpose.int %173, %int-2_2733, %int-1_2734 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2735 = torch.constant.int 4096
%2769 = torch.prim.ListConstruct %566, %int4096_2735 : (!torch.int, !torch.int) -> !torch.list<int>
%2770 = torch.aten.view %2767, %2769 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2770, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2771 = torch.aten.mm %2770, %2768 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2771, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2736 = torch.constant.int 1
%int4096_2737 = torch.constant.int 4096
%2772 = torch.prim.ListConstruct %int1_2736, %566, %int4096_2737 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2773 = torch.aten.view %2771, %2772 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2773, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2738 = torch.constant.int 15
%2774 = torch.prims.convert_element_type %2773, %int15_2738 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2774, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2775 = torch.aten.div.Tensor %2764, %174 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2775, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2739 = torch.constant.float -2.400000e+02
%float2.400000e02_2740 = torch.constant.float 2.400000e+02
%2776 = torch.aten.clamp %2775, %float-2.400000e02_2739, %float2.400000e02_2740 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2776, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2741 = torch.constant.int 26
%2777 = torch.prims.convert_element_type %2776, %int26_2741 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2777, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2742 = torch.constant.int -2
%int-1_2743 = torch.constant.int -1
%2778 = torch.aten.transpose.int %175, %int-2_2742, %int-1_2743 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2744 = torch.constant.int 4096
%2779 = torch.prim.ListConstruct %566, %int4096_2744 : (!torch.int, !torch.int) -> !torch.list<int>
%2780 = torch.aten.view %2777, %2779 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2780, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2781 = torch.aten.mm %2780, %2778 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2781, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2745 = torch.constant.int 1
%int1024_2746 = torch.constant.int 1024
%2782 = torch.prim.ListConstruct %int1_2745, %566, %int1024_2746 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2783 = torch.aten.view %2781, %2782 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2783, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2747 = torch.constant.int 15
%2784 = torch.prims.convert_element_type %2783, %int15_2747 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2784, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%2785 = torch.aten.div.Tensor %2764, %176 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2785, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2748 = torch.constant.float -2.400000e+02
%float2.400000e02_2749 = torch.constant.float 2.400000e+02
%2786 = torch.aten.clamp %2785, %float-2.400000e02_2748, %float2.400000e02_2749 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2786, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2750 = torch.constant.int 26
%2787 = torch.prims.convert_element_type %2786, %int26_2750 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2787, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2751 = torch.constant.int -2
%int-1_2752 = torch.constant.int -1
%2788 = torch.aten.transpose.int %177, %int-2_2751, %int-1_2752 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_2753 = torch.constant.int 4096
%2789 = torch.prim.ListConstruct %566, %int4096_2753 : (!torch.int, !torch.int) -> !torch.list<int>
%2790 = torch.aten.view %2787, %2789 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2790, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2791 = torch.aten.mm %2790, %2788 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2791, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_2754 = torch.constant.int 1
%int1024_2755 = torch.constant.int 1024
%2792 = torch.prim.ListConstruct %int1_2754, %566, %int1024_2755 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2793 = torch.aten.view %2791, %2792 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %2793, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_2756 = torch.constant.int 15
%2794 = torch.prims.convert_element_type %2793, %int15_2756 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %2794, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%int1_2757 = torch.constant.int 1
%int32_2758 = torch.constant.int 32
%int128_2759 = torch.constant.int 128
%2795 = torch.prim.ListConstruct %int1_2757, %566, %int32_2758, %int128_2759 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2796 = torch.aten.view %2774, %2795 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2796, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2760 = torch.constant.int 1
%int8_2761 = torch.constant.int 8
%int128_2762 = torch.constant.int 128
%2797 = torch.prim.ListConstruct %int1_2760, %566, %int8_2761, %int128_2762 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2798 = torch.aten.view %2784, %2797 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2798, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_2763 = torch.constant.int 1
%int8_2764 = torch.constant.int 8
%int128_2765 = torch.constant.int 128
%2799 = torch.prim.ListConstruct %int1_2763, %566, %int8_2764, %int128_2765 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2800 = torch.aten.view %2794, %2799 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2800, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
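// RoPE setup: positions 0..131071 and per-channel inverse frequencies 1 / 500000^(2*floor(i/2)/128) for i in 0..127, combined into a [131072, 128] angle table.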
%int131072_2766 = torch.constant.int 131072
%none_2767 = torch.constant.none
%none_2768 = torch.constant.none
%cpu_2769 = torch.constant.device "cpu"
%false_2770 = torch.constant.bool false
%2801 = torch.aten.arange %int131072_2766, %none_2767, %none_2768, %cpu_2769, %false_2770 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2771 = torch.constant.int 0
%int128_2772 = torch.constant.int 128
%none_2773 = torch.constant.none
%none_2774 = torch.constant.none
%cpu_2775 = torch.constant.device "cpu"
%false_2776 = torch.constant.bool false
%2802 = torch.aten.arange.start %int0_2771, %int128_2772, %none_2773, %none_2774, %cpu_2775, %false_2776 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2777 = torch.constant.int 2
%2803 = torch.aten.floor_divide.Scalar %2802, %int2_2777 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2778 = torch.constant.int 6
%2804 = torch.prims.convert_element_type %2803, %int6_2778 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2779 = torch.constant.int 128
%2805 = torch.aten.div.Scalar %2804, %int128_2779 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2780 = torch.constant.float 2.000000e+00
%2806 = torch.aten.mul.Scalar %2805, %float2.000000e00_2780 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2781 = torch.constant.float 5.000000e+05
%2807 = torch.aten.pow.Scalar %float5.000000e05_2781, %2806 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2808 = torch.aten.reciprocal %2807 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2782 = torch.constant.float 1.000000e+00
%2809 = torch.aten.mul.Scalar %2808, %float1.000000e00_2782 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2783 = torch.constant.int 131072
%int1_2784 = torch.constant.int 1
%2810 = torch.prim.ListConstruct %int131072_2783, %int1_2784 : (!torch.int, !torch.int) -> !torch.list<int>
%2811 = torch.aten.view %2801, %2810 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2812 = torch.aten.mul.Tensor %2811, %2809 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2785 = torch.constant.int 1
%2813 = torch.aten.size.int %2773, %int1_2785 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2786 = torch.constant.int 0
%2814 = torch.aten.add.int %int0_2786, %2813 : !torch.int, !torch.int -> !torch.int
%int0_2787 = torch.constant.int 0
%int0_2788 = torch.constant.int 0
%int1_2789 = torch.constant.int 1
%2815 = torch.aten.slice.Tensor %2812, %int0_2787, %int0_2788, %2814, %int1_2789 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2815, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2790 = torch.constant.int 1
%int0_2791 = torch.constant.int 0
%int9223372036854775807_2792 = torch.constant.int 9223372036854775807
%int1_2793 = torch.constant.int 1
%2816 = torch.aten.slice.Tensor %2815, %int1_2790, %int0_2791, %int9223372036854775807_2792, %int1_2793 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2816, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2794 = torch.constant.int 1
%int0_2795 = torch.constant.int 0
%int9223372036854775807_2796 = torch.constant.int 9223372036854775807
%int1_2797 = torch.constant.int 1
%2817 = torch.aten.slice.Tensor %2816, %int1_2794, %int0_2795, %int9223372036854775807_2796, %int1_2797 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2817, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2798 = torch.constant.int 0
%2818 = torch.aten.unsqueeze %2817, %int0_2798 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2818, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2799 = torch.constant.int 1
%int0_2800 = torch.constant.int 0
%int9223372036854775807_2801 = torch.constant.int 9223372036854775807
%int1_2802 = torch.constant.int 1
%2819 = torch.aten.slice.Tensor %2818, %int1_2799, %int0_2800, %int9223372036854775807_2801, %int1_2802 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2819, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2803 = torch.constant.int 2
%int0_2804 = torch.constant.int 0
%int9223372036854775807_2805 = torch.constant.int 9223372036854775807
%int1_2806 = torch.constant.int 1
%2820 = torch.aten.slice.Tensor %2819, %int2_2803, %int0_2804, %int9223372036854775807_2805, %int1_2806 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2820, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2807 = torch.constant.int 1
%int1_2808 = torch.constant.int 1
%int1_2809 = torch.constant.int 1
%2821 = torch.prim.ListConstruct %int1_2807, %int1_2808, %int1_2809 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2822 = torch.aten.repeat %2820, %2821 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2822, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
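// Apply rotary embedding to the 32-head query states: upcast to f32, call the sharktank rotary kernel, then cast back to bf16.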
%int6_2810 = torch.constant.int 6
%2823 = torch.prims.convert_element_type %2796, %int6_2810 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2823, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2824 = torch_c.to_builtin_tensor %2823 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%2825 = torch_c.to_builtin_tensor %2822 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2826 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%2824, %2825) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%2827 = torch_c.from_builtin_tensor %2826 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2827, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2811 = torch.constant.int 15
%2828 = torch.prims.convert_element_type %2827, %int15_2811 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2828, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
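// Key path: rebuild the same RoPE angle table and slice it to the current sequence length.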
%int131072_2812 = torch.constant.int 131072
%none_2813 = torch.constant.none
%none_2814 = torch.constant.none
%cpu_2815 = torch.constant.device "cpu"
%false_2816 = torch.constant.bool false
%2829 = torch.aten.arange %int131072_2812, %none_2813, %none_2814, %cpu_2815, %false_2816 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_2817 = torch.constant.int 0
%int128_2818 = torch.constant.int 128
%none_2819 = torch.constant.none
%none_2820 = torch.constant.none
%cpu_2821 = torch.constant.device "cpu"
%false_2822 = torch.constant.bool false
%2830 = torch.aten.arange.start %int0_2817, %int128_2818, %none_2819, %none_2820, %cpu_2821, %false_2822 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_2823 = torch.constant.int 2
%2831 = torch.aten.floor_divide.Scalar %2830, %int2_2823 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_2824 = torch.constant.int 6
%2832 = torch.prims.convert_element_type %2831, %int6_2824 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_2825 = torch.constant.int 128
%2833 = torch.aten.div.Scalar %2832, %int128_2825 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_2826 = torch.constant.float 2.000000e+00
%2834 = torch.aten.mul.Scalar %2833, %float2.000000e00_2826 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_2827 = torch.constant.float 5.000000e+05
%2835 = torch.aten.pow.Scalar %float5.000000e05_2827, %2834 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%2836 = torch.aten.reciprocal %2835 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_2828 = torch.constant.float 1.000000e+00
%2837 = torch.aten.mul.Scalar %2836, %float1.000000e00_2828 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_2829 = torch.constant.int 131072
%int1_2830 = torch.constant.int 1
%2838 = torch.prim.ListConstruct %int131072_2829, %int1_2830 : (!torch.int, !torch.int) -> !torch.list<int>
%2839 = torch.aten.view %2829, %2838 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%2840 = torch.aten.mul.Tensor %2839, %2837 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_2831 = torch.constant.int 1
%2841 = torch.aten.size.int %2783, %int1_2831 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_2832 = torch.constant.int 0
%2842 = torch.aten.add.int %int0_2832, %2841 : !torch.int, !torch.int -> !torch.int
%int0_2833 = torch.constant.int 0
%int0_2834 = torch.constant.int 0
%int1_2835 = torch.constant.int 1
%2843 = torch.aten.slice.Tensor %2840, %int0_2833, %int0_2834, %2842, %int1_2835 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2843, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2836 = torch.constant.int 1
%int0_2837 = torch.constant.int 0
%int9223372036854775807_2838 = torch.constant.int 9223372036854775807
%int1_2839 = torch.constant.int 1
%2844 = torch.aten.slice.Tensor %2843, %int1_2836, %int0_2837, %int9223372036854775807_2838, %int1_2839 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2844, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_2840 = torch.constant.int 1
%int0_2841 = torch.constant.int 0
%int9223372036854775807_2842 = torch.constant.int 9223372036854775807
%int1_2843 = torch.constant.int 1
%2845 = torch.aten.slice.Tensor %2844, %int1_2840, %int0_2841, %int9223372036854775807_2842, %int1_2843 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %2845, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_2844 = torch.constant.int 0
%2846 = torch.aten.unsqueeze %2845, %int0_2844 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2846, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2845 = torch.constant.int 1
%int0_2846 = torch.constant.int 0
%int9223372036854775807_2847 = torch.constant.int 9223372036854775807
%int1_2848 = torch.constant.int 1
%2847 = torch.aten.slice.Tensor %2846, %int1_2845, %int0_2846, %int9223372036854775807_2847, %int1_2848 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2847, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_2849 = torch.constant.int 2
%int0_2850 = torch.constant.int 0
%int9223372036854775807_2851 = torch.constant.int 9223372036854775807
%int1_2852 = torch.constant.int 1
%2848 = torch.aten.slice.Tensor %2847, %int2_2849, %int0_2850, %int9223372036854775807_2851, %int1_2852 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2848, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_2853 = torch.constant.int 1
%int1_2854 = torch.constant.int 1
%int1_2855 = torch.constant.int 1
%2849 = torch.prim.ListConstruct %int1_2853, %int1_2854, %int1_2855 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2850 = torch.aten.repeat %2848, %2849 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %2850, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
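// Apply rotary embedding to the 8-head key states.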
%int6_2856 = torch.constant.int 6
%2851 = torch.prims.convert_element_type %2798, %int6_2856 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2851, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%2852 = torch_c.to_builtin_tensor %2851 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%2853 = torch_c.to_builtin_tensor %2850 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%2854 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%2852, %2853) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%2855 = torch_c.from_builtin_tensor %2854 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %2855, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_2857 = torch.constant.int 15
%2856 = torch.prims.convert_element_type %2855, %int15_2857 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2856, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
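// Quantize the rotated keys and the value states for the KV cache: divide by the cache quantizer scale and clamp to [-240, 240] (the f8E4M3FNUZ finite range) before converting.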
%2857 = torch.aten.div.Tensor %2856, %178 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2857, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2858 = torch.constant.float -2.400000e+02
%float2.400000e02_2859 = torch.constant.float 2.400000e+02
%2858 = torch.aten.clamp %2857, %float-2.400000e02_2858, %float2.400000e02_2859 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2858, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2860 = torch.constant.int 26
%2859 = torch.prims.convert_element_type %2858, %int26_2860 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2859, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%2860 = torch.aten.div.Tensor %2800, %178 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2860, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_2861 = torch.constant.float -2.400000e+02
%float2.400000e02_2862 = torch.constant.float 2.400000e+02
%2861 = torch.aten.clamp %2860, %float-2.400000e02_2861, %float2.400000e02_2862 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %2861, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_2863 = torch.constant.int 26
%2862 = torch.prims.convert_element_type %2861, %int26_2863 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2862, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
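// Compute cache slot indices as %arg2 * 64 + 20 and scatter the quantized keys into the paged cache ([?, 2097152] viewed as [?, 32, 2, 32, 8, 128]) with index_put.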
%int64_2864 = torch.constant.int 64
%2863 = torch.aten.mul.Scalar %arg2, %int64_2864 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2863, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int20 = torch.constant.int 20
%int1_2865 = torch.constant.int 1
%2864 = torch.aten.add.Scalar %2863, %int20, %int1_2865 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2864, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_2866 = torch.constant.int 1
%int32_2867 = torch.constant.int 32
%int8_2868 = torch.constant.int 8
%int128_2869 = torch.constant.int 128
%2865 = torch.prim.ListConstruct %int1_2866, %670, %int32_2867, %int8_2868, %int128_2869 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2866 = torch.aten.view %2859, %2865 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2866, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2870 = torch.constant.int 32
%int8_2871 = torch.constant.int 8
%int128_2872 = torch.constant.int 128
%2867 = torch.prim.ListConstruct %670, %int32_2870, %int8_2871, %int128_2872 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2868 = torch.aten.view %2866, %2867 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2868, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%2869 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2870 = torch.aten.view %2864, %2869 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2870, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_2873 = torch.constant.int 32
%int2_2874 = torch.constant.int 2
%int32_2875 = torch.constant.int 32
%int8_2876 = torch.constant.int 8
%int128_2877 = torch.constant.int 128
%2871 = torch.prim.ListConstruct %661, %int32_2873, %int2_2874, %int32_2875, %int8_2876, %int128_2877 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2872 = torch.aten.view %2679, %2871 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2872, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2878 = torch.constant.int 32
%2873 = torch.aten.mul.int %661, %int32_2878 : !torch.int, !torch.int -> !torch.int
%int2_2879 = torch.constant.int 2
%2874 = torch.aten.mul.int %2873, %int2_2879 : !torch.int, !torch.int -> !torch.int
%int32_2880 = torch.constant.int 32
%int8_2881 = torch.constant.int 8
%int128_2882 = torch.constant.int 128
%2875 = torch.prim.ListConstruct %2874, %int32_2880, %int8_2881, %int128_2882 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2876 = torch.aten.view %2872, %2875 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2876, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%2877 = torch.prim.ListConstruct %2870 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2883 = torch.constant.bool false
%2878 = torch.aten.index_put %2876, %2877, %2868, %false_2883 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2878, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2884 = torch.constant.int 32
%int2_2885 = torch.constant.int 2
%int32_2886 = torch.constant.int 32
%int8_2887 = torch.constant.int 8
%int128_2888 = torch.constant.int 128
%2879 = torch.prim.ListConstruct %661, %int32_2884, %int2_2885, %int32_2886, %int8_2887, %int128_2888 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2880 = torch.aten.view %2878, %2879 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2880, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2889 = torch.constant.int 2097152
%2881 = torch.prim.ListConstruct %661, %int2097152_2889 : (!torch.int, !torch.int) -> !torch.list<int>
%2882 = torch.aten.view %2880, %2881 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2882, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
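// Re-view the updated cache and scatter the quantized values with a second index_put at slot index + 1, then flatten back to [?, 2097152].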
%int32_2890 = torch.constant.int 32
%int2_2891 = torch.constant.int 2
%int32_2892 = torch.constant.int 32
%int8_2893 = torch.constant.int 8
%int128_2894 = torch.constant.int 128
%2883 = torch.prim.ListConstruct %661, %int32_2890, %int2_2891, %int32_2892, %int8_2893, %int128_2894 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2884 = torch.aten.view %2882, %2883 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2884, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_2895 = torch.constant.int 32
%int8_2896 = torch.constant.int 8
%int128_2897 = torch.constant.int 128
%2885 = torch.prim.ListConstruct %2874, %int32_2895, %int8_2896, %int128_2897 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2886 = torch.aten.view %2884, %2885 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2886, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_2898 = torch.constant.int 1
%int32_2899 = torch.constant.int 32
%int8_2900 = torch.constant.int 8
%int128_2901 = torch.constant.int 128
%2887 = torch.prim.ListConstruct %int1_2898, %670, %int32_2899, %int8_2900, %int128_2901 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2888 = torch.aten.view %2862, %2887 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2888, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_2902 = torch.constant.int 32
%int8_2903 = torch.constant.int 8
%int128_2904 = torch.constant.int 128
%2889 = torch.prim.ListConstruct %670, %int32_2902, %int8_2903, %int128_2904 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2890 = torch.aten.view %2888, %2889 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2890, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_2905 = torch.constant.int 1
%int1_2906 = torch.constant.int 1
%2891 = torch.aten.add.Scalar %2864, %int1_2905, %int1_2906 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %2891, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%2892 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%2893 = torch.aten.view %2891, %2892 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %2893, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%2894 = torch.prim.ListConstruct %2893 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_2907 = torch.constant.bool false
%2895 = torch.aten.index_put %2886, %2894, %2890, %false_2907 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %2895, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_2908 = torch.constant.int 32
%int2_2909 = torch.constant.int 2
%int32_2910 = torch.constant.int 32
%int8_2911 = torch.constant.int 8
%int128_2912 = torch.constant.int 128
%2896 = torch.prim.ListConstruct %661, %int32_2908, %int2_2909, %int32_2910, %int8_2911, %int128_2912 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2897 = torch.aten.view %2895, %2896 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %2897, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_2913 = torch.constant.int 2097152
%2898 = torch.prim.ListConstruct %661, %int2097152_2913 : (!torch.int, !torch.int) -> !torch.list<int>
%2899 = torch.aten.view %2897, %2898 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %2899, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
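// Grouped-query attention: expand the 8 KV heads to 32 by unsqueezing, broadcasting 4x, and flattening to [1, seq, 32, 128].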
%int-2_2914 = torch.constant.int -2
%2900 = torch.aten.unsqueeze %2859, %int-2_2914 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2900, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2915 = torch.constant.int 1
%int8_2916 = torch.constant.int 8
%int4_2917 = torch.constant.int 4
%int128_2918 = torch.constant.int 128
%2901 = torch.prim.ListConstruct %int1_2915, %2841, %int8_2916, %int4_2917, %int128_2918 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2919 = torch.constant.bool false
%2902 = torch.aten.expand %2900, %2901, %false_2919 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2902, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2920 = torch.constant.int 0
%2903 = torch.aten.clone %2902, %int0_2920 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2903, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2921 = torch.constant.int 1
%int32_2922 = torch.constant.int 32
%int128_2923 = torch.constant.int 128
%2904 = torch.prim.ListConstruct %int1_2921, %2841, %int32_2922, %int128_2923 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2905 = torch.aten._unsafe_view %2903, %2904 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2905, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_2924 = torch.constant.int -2
%2906 = torch.aten.unsqueeze %2862, %int-2_2924 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2906, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_2925 = torch.constant.int 1
%2907 = torch.aten.size.int %2793, %int1_2925 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_2926 = torch.constant.int 1
%int8_2927 = torch.constant.int 8
%int4_2928 = torch.constant.int 4
%int128_2929 = torch.constant.int 128
%2908 = torch.prim.ListConstruct %int1_2926, %2907, %int8_2927, %int4_2928, %int128_2929 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_2930 = torch.constant.bool false
%2909 = torch.aten.expand %2906, %2908, %false_2930 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2909, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_2931 = torch.constant.int 0
%2910 = torch.aten.clone %2909, %int0_2931 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2910, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_2932 = torch.constant.int 1
%int32_2933 = torch.constant.int 32
%int128_2934 = torch.constant.int 128
%2911 = torch.prim.ListConstruct %int1_2932, %2907, %int32_2933, %int128_2934 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2912 = torch.aten._unsafe_view %2910, %2911 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %2912, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
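// Dequantize the expanded keys and values back to bf16 by multiplying with the KV-cache scale.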
%int6_2935 = torch.constant.int 6
%2913 = torch.prims.convert_element_type %2905, %int6_2935 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2913, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2914 = torch.aten.mul.Tensor %2913, %178 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2914, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2936 = torch.constant.int 15
%2915 = torch.prims.convert_element_type %2914, %int15_2936 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2915, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_2937 = torch.constant.int 6
%2916 = torch.prims.convert_element_type %2912, %int6_2937 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2916, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%2917 = torch.aten.mul.Tensor %2916, %178 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %2917, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_2938 = torch.constant.int 15
%2918 = torch.prims.convert_element_type %2917, %int15_2938 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2918, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
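// Transpose Q, K, and V to [1, 32, seq, 128] and run causal scaled-dot-product flash attention (dropout 0.0, is_causal = true).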
%int1_2939 = torch.constant.int 1
%int2_2940 = torch.constant.int 2
%2919 = torch.aten.transpose.int %2828, %int1_2939, %int2_2940 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2919, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2941 = torch.constant.int 1
%int2_2942 = torch.constant.int 2
%2920 = torch.aten.transpose.int %2915, %int1_2941, %int2_2942 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2920, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%int1_2943 = torch.constant.int 1
%int2_2944 = torch.constant.int 2
%2921 = torch.aten.transpose.int %2918, %int1_2943, %int2_2944 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,32,?,128],bf16>
torch.bind_symbolic_shape %2921, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
%float0.000000e00_2945 = torch.constant.float 0.000000e+00
%true_2946 = torch.constant.bool true
%none_2947 = torch.constant.none
%none_2948 = torch.constant.none
%2922:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2919, %2920, %2921, %float0.000000e00_2945, %true_2946, %none_2947, %none_2948) : (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?,128],bf16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,32,?,128],bf16>, !torch.vtensor<[1,32,?],f32>)
torch.bind_symbolic_shape %2922#0, [%548], affine_map<()[s0] -> (1, 32, s0 * 32, 128)> : !torch.vtensor<[1,32,?,128],bf16>
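// Transpose the attention output back to [1, seq, 32, 128], flatten to [1, seq, 4096], quantize, apply the attention output projection (4096 -> 4096), and add the residual.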
%int1_2949 = torch.constant.int 1
%int2_2950 = torch.constant.int 2
%2923 = torch.aten.transpose.int %2922#0, %int1_2949, %int2_2950 : !torch.vtensor<[1,32,?,128],bf16>, !torch.int, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %2923, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_2951 = torch.constant.int 1
%int4096_2952 = torch.constant.int 4096
%2924 = torch.prim.ListConstruct %int1_2951, %2813, %int4096_2952 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2925 = torch.aten.view %2923, %2924 : !torch.vtensor<[1,?,32,128],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2925, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2926 = torch.aten.div.Tensor %2925, %179 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2926, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%float-2.400000e02_2953 = torch.constant.float -2.400000e+02
%float2.400000e02_2954 = torch.constant.float 2.400000e+02
%2927 = torch.aten.clamp %2926, %float-2.400000e02_2953, %float2.400000e02_2954 : !torch.vtensor<[1,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2927, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int26_2955 = torch.constant.int 26
%2928 = torch.prims.convert_element_type %2927, %int26_2955 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2928, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2956 = torch.constant.int -2
%int-1_2957 = torch.constant.int -1
%2929 = torch.aten.transpose.int %180, %int-2_2956, %int-1_2957 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_2958 = torch.constant.int 4096
%2930 = torch.prim.ListConstruct %2813, %int4096_2958 : (!torch.int, !torch.int) -> !torch.list<int>
%2931 = torch.aten.view %2928, %2930 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2931, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2932 = torch.aten.mm %2931, %2929 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2932, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2959 = torch.constant.int 1
%int4096_2960 = torch.constant.int 4096
%2933 = torch.prim.ListConstruct %int1_2959, %2813, %int4096_2960 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2934 = torch.aten.view %2932, %2933 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2934, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2961 = torch.constant.int 15
%2935 = torch.prims.convert_element_type %2934, %int15_2961 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2935, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2962 = torch.constant.int 1
%2936 = torch.aten.add.Tensor %2757, %2935, %int1_2962 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2936, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
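// RMSNorm over the hidden dimension (mean of squares, eps = 1e-5, rsqrt), scaled by the norm weight, then clamped and quantized to f8E4M3FNUZ for the FFN projections.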
%int2_2963 = torch.constant.int 2
%2937 = torch.aten.pow.Tensor_Scalar %2936, %int2_2963 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2937, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2964 = torch.constant.int -1
%2938 = torch.prim.ListConstruct %int-1_2964 : (!torch.int) -> !torch.list<int>
%true_2965 = torch.constant.bool true
%none_2966 = torch.constant.none
%2939 = torch.aten.mean.dim %2937, %2938, %true_2965, %none_2966 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2939, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_2967 = torch.constant.float 1.000000e-05
%int1_2968 = torch.constant.int 1
%2940 = torch.aten.add.Scalar %2939, %float1.000000e-05_2967, %int1_2968 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2940, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2941 = torch.aten.rsqrt %2940 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2941, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2942 = torch.aten.mul.Tensor %2936, %2941 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2942, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2943 = torch.aten.mul.Tensor %181, %2942 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2943, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2944 = torch.aten.div.Tensor %2943, %182 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2944, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2969 = torch.constant.float -2.400000e+02
%float2.400000e02_2970 = torch.constant.float 2.400000e+02
%2945 = torch.aten.clamp %2944, %float-2.400000e02_2969, %float2.400000e02_2970 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2945, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2971 = torch.constant.int 26
%2946 = torch.prims.convert_element_type %2945, %int26_2971 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2946, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
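// Gated FFN: the first 4096 -> 14336 projection passes through silu and gates the second 4096 -> 14336 projection elementwise.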
%int-2_2972 = torch.constant.int -2
%int-1_2973 = torch.constant.int -1
%2947 = torch.aten.transpose.int %183, %int-2_2972, %int-1_2973 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2974 = torch.constant.int 4096
%2948 = torch.prim.ListConstruct %566, %int4096_2974 : (!torch.int, !torch.int) -> !torch.list<int>
%2949 = torch.aten.view %2946, %2948 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2949, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2950 = torch.aten.mm %2949, %2947 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2950, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2975 = torch.constant.int 1
%int14336_2976 = torch.constant.int 14336
%2951 = torch.prim.ListConstruct %int1_2975, %566, %int14336_2976 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2952 = torch.aten.view %2950, %2951 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2952, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2977 = torch.constant.int 15
%2953 = torch.prims.convert_element_type %2952, %int15_2977 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2953, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2954 = torch.aten.silu %2953 : !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2954, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2955 = torch.aten.div.Tensor %2943, %184 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2955, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_2978 = torch.constant.float -2.400000e+02
%float2.400000e02_2979 = torch.constant.float 2.400000e+02
%2956 = torch.aten.clamp %2955, %float-2.400000e02_2978, %float2.400000e02_2979 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2956, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_2980 = torch.constant.int 26
%2957 = torch.prims.convert_element_type %2956, %int26_2980 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2957, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_2981 = torch.constant.int -2
%int-1_2982 = torch.constant.int -1
%2958 = torch.aten.transpose.int %185, %int-2_2981, %int-1_2982 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%int4096_2983 = torch.constant.int 4096
%2959 = torch.prim.ListConstruct %566, %int4096_2983 : (!torch.int, !torch.int) -> !torch.list<int>
%2960 = torch.aten.view %2957, %2959 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2960, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2961 = torch.aten.mm %2960, %2958 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,14336],f8E4M3FNUZ> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2961, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%int1_2984 = torch.constant.int 1
%int14336_2985 = torch.constant.int 14336
%2962 = torch.prim.ListConstruct %int1_2984, %566, %int14336_2985 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2963 = torch.aten.view %2961, %2962 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2963, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int15_2986 = torch.constant.int 15
%2964 = torch.prims.convert_element_type %2963, %int15_2986 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2964, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%2965 = torch.aten.mul.Tensor %2954, %2964 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[1,?,14336],bf16> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2965, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
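// Quantize the gated activations and apply the 14336 -> 4096 down projection, then add the residual.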
%2966 = torch.aten.div.Tensor %2965, %186 : !torch.vtensor<[1,?,14336],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2966, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%float-2.400000e02_2987 = torch.constant.float -2.400000e+02
%float2.400000e02_2988 = torch.constant.float 2.400000e+02
%2967 = torch.aten.clamp %2966, %float-2.400000e02_2987, %float2.400000e02_2988 : !torch.vtensor<[1,?,14336],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,14336],bf16>
torch.bind_symbolic_shape %2967, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],bf16>
%int26_2989 = torch.constant.int 26
%2968 = torch.prims.convert_element_type %2967, %int26_2989 : !torch.vtensor<[1,?,14336],bf16>, !torch.int -> !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2968, [%548], affine_map<()[s0] -> (1, s0 * 32, 14336)> : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>
%int-2_2990 = torch.constant.int -2
%int-1_2991 = torch.constant.int -1
%2969 = torch.aten.transpose.int %187, %int-2_2990, %int-1_2991 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%int1_2992 = torch.constant.int 1
%2970 = torch.aten.size.int %2952, %int1_2992 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.int -> !torch.int
%int14336_2993 = torch.constant.int 14336
%2971 = torch.prim.ListConstruct %2970, %int14336_2993 : (!torch.int, !torch.int) -> !torch.list<int>
%2972 = torch.aten.view %2968, %2971 : !torch.vtensor<[1,?,14336],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,14336],f8E4M3FNUZ>
torch.bind_symbolic_shape %2972, [%548], affine_map<()[s0] -> (s0 * 32, 14336)> : !torch.vtensor<[?,14336],f8E4M3FNUZ>
%2973 = torch.aten.mm %2972, %2969 : !torch.vtensor<[?,14336],f8E4M3FNUZ>, !torch.vtensor<[14336,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2973, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_2994 = torch.constant.int 1
%int4096_2995 = torch.constant.int 4096
%2974 = torch.prim.ListConstruct %int1_2994, %2970, %int4096_2995 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2975 = torch.aten.view %2973, %2974 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2975, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_2996 = torch.constant.int 15
%2976 = torch.prims.convert_element_type %2975, %int15_2996 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2976, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int1_2997 = torch.constant.int 1
%2977 = torch.aten.add.Tensor %2936, %2976, %int1_2997 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2977, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
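// The pattern repeats for the next attention block: RMSNorm, quantize, then Q (4096 -> 4096), K (4096 -> 1024), and V (4096 -> 1024) projections reshaped into 32 query heads and 8 KV heads of size 128.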
%int2_2998 = torch.constant.int 2
%2978 = torch.aten.pow.Tensor_Scalar %2977, %int2_2998 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2978, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_2999 = torch.constant.int -1
%2979 = torch.prim.ListConstruct %int-1_2999 : (!torch.int) -> !torch.list<int>
%true_3000 = torch.constant.bool true
%none_3001 = torch.constant.none
%2980 = torch.aten.mean.dim %2978, %2979, %true_3000, %none_3001 : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2980, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05_3002 = torch.constant.float 1.000000e-05
%int1_3003 = torch.constant.int 1
%2981 = torch.aten.add.Scalar %2980, %float1.000000e-05_3002, %int1_3003 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2981, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2982 = torch.aten.rsqrt %2981 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %2982, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%2983 = torch.aten.mul.Tensor %2977, %2982 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2983, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2984 = torch.aten.mul.Tensor %188, %2983 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2984, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%2985 = torch.aten.div.Tensor %2984, %189 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2985, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_3004 = torch.constant.float -2.400000e+02
%float2.400000e02_3005 = torch.constant.float 2.400000e+02
%2986 = torch.aten.clamp %2985, %float-2.400000e02_3004, %float2.400000e02_3005 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2986, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_3006 = torch.constant.int 26
%2987 = torch.prims.convert_element_type %2986, %int26_3006 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2987, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_3007 = torch.constant.int -2
%int-1_3008 = torch.constant.int -1
%2988 = torch.aten.transpose.int %190, %int-2_3007, %int-1_3008 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%int4096_3009 = torch.constant.int 4096
%2989 = torch.prim.ListConstruct %566, %int4096_3009 : (!torch.int, !torch.int) -> !torch.list<int>
%2990 = torch.aten.view %2987, %2989 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2990, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%2991 = torch.aten.mm %2990, %2988 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,4096],f8E4M3FNUZ> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2991, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%int1_3010 = torch.constant.int 1
%int4096_3011 = torch.constant.int 4096
%2992 = torch.prim.ListConstruct %int1_3010, %566, %int4096_3011 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%2993 = torch.aten.view %2991, %2992 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2993, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int15_3012 = torch.constant.int 15
%2994 = torch.prims.convert_element_type %2993, %int15_3012 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %2994, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%2995 = torch.aten.div.Tensor %2984, %191 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2995, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_3013 = torch.constant.float -2.400000e+02
%float2.400000e02_3014 = torch.constant.float 2.400000e+02
%2996 = torch.aten.clamp %2995, %float-2.400000e02_3013, %float2.400000e02_3014 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %2996, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_3015 = torch.constant.int 26
%2997 = torch.prims.convert_element_type %2996, %int26_3015 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %2997, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_3016 = torch.constant.int -2
%int-1_3017 = torch.constant.int -1
%2998 = torch.aten.transpose.int %192, %int-2_3016, %int-1_3017 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_3018 = torch.constant.int 4096
%2999 = torch.prim.ListConstruct %566, %int4096_3018 : (!torch.int, !torch.int) -> !torch.list<int>
%3000 = torch.aten.view %2997, %2999 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %3000, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%3001 = torch.aten.mm %3000, %2998 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3001, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_3019 = torch.constant.int 1
%int1024_3020 = torch.constant.int 1024
%3002 = torch.prim.ListConstruct %int1_3019, %566, %int1024_3020 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3003 = torch.aten.view %3001, %3002 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3003, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_3021 = torch.constant.int 15
%3004 = torch.prims.convert_element_type %3003, %int15_3021 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %3004, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
%3005 = torch.aten.div.Tensor %2984, %193 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %3005, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%float-2.400000e02_3022 = torch.constant.float -2.400000e+02
%float2.400000e02_3023 = torch.constant.float 2.400000e+02
%3006 = torch.aten.clamp %3005, %float-2.400000e02_3022, %float2.400000e02_3023 : !torch.vtensor<[1,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %3006, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int26_3024 = torch.constant.int 26
%3007 = torch.prims.convert_element_type %3006, %int26_3024 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %3007, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>
%int-2_3025 = torch.constant.int -2
%int-1_3026 = torch.constant.int -1
%3008 = torch.aten.transpose.int %194, %int-2_3025, %int-1_3026 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4096,1024],f8E4M3FNUZ>
%int4096_3027 = torch.constant.int 4096
%3009 = torch.prim.ListConstruct %566, %int4096_3027 : (!torch.int, !torch.int) -> !torch.list<int>
%3010 = torch.aten.view %3007, %3009 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,4096],f8E4M3FNUZ>
torch.bind_symbolic_shape %3010, [%548], affine_map<()[s0] -> (s0 * 32, 4096)> : !torch.vtensor<[?,4096],f8E4M3FNUZ>
%3011 = torch.aten.mm %3010, %3008 : !torch.vtensor<[?,4096],f8E4M3FNUZ>, !torch.vtensor<[4096,1024],f8E4M3FNUZ> -> !torch.vtensor<[?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3011, [%548], affine_map<()[s0] -> (s0 * 32, 1024)> : !torch.vtensor<[?,1024],f8E4M3FNUZ>
%int1_3028 = torch.constant.int 1
%int1024_3029 = torch.constant.int 1024
%3012 = torch.prim.ListConstruct %int1_3028, %566, %int1024_3029 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3013 = torch.aten.view %3011, %3012 : !torch.vtensor<[?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
torch.bind_symbolic_shape %3013, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>
%int15_3030 = torch.constant.int 15
%3014 = torch.prims.convert_element_type %3013, %int15_3030 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,1024],bf16>
torch.bind_symbolic_shape %3014, [%548], affine_map<()[s0] -> (1, s0 * 32, 1024)> : !torch.vtensor<[1,?,1024],bf16>
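// Reshape into per-head layout: the 4096-wide activation %2994 becomes [1, seq, 32, 128] and the two 1024-wide outputs %3004 / %3014 become [1, seq, 8, 128], consistent with 32 query heads and 8 KV heads.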
%int1_3031 = torch.constant.int 1
%int32_3032 = torch.constant.int 32
%int128_3033 = torch.constant.int 128
%3015 = torch.prim.ListConstruct %int1_3031, %566, %int32_3032, %int128_3033 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3016 = torch.aten.view %2994, %3015 : !torch.vtensor<[1,?,4096],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %3016, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int1_3034 = torch.constant.int 1
%int8_3035 = torch.constant.int 8
%int128_3036 = torch.constant.int 128
%3017 = torch.prim.ListConstruct %int1_3034, %566, %int8_3035, %int128_3036 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3018 = torch.aten.view %3004, %3017 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3018, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int1_3037 = torch.constant.int 1
%int8_3038 = torch.constant.int 8
%int128_3039 = torch.constant.int 128
%3019 = torch.prim.ListConstruct %int1_3037, %566, %int8_3038, %int128_3039 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3020 = torch.aten.view %3014, %3019 : !torch.vtensor<[1,?,1024],bf16>, !torch.list<int> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3020, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
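// Rotary embedding for the 32-head tensor: build the inverse-frequency table (head_dim 128, base 5.0e5) over 131072 positions, slice it to the current sequence length, and apply it via the sharktank_rotary_embedding_1_D_32_128_f32 kernel, converting back to bf16 afterwards.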
%int131072_3040 = torch.constant.int 131072
%none_3041 = torch.constant.none
%none_3042 = torch.constant.none
%cpu_3043 = torch.constant.device "cpu"
%false_3044 = torch.constant.bool false
%3021 = torch.aten.arange %int131072_3040, %none_3041, %none_3042, %cpu_3043, %false_3044 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_3045 = torch.constant.int 0
%int128_3046 = torch.constant.int 128
%none_3047 = torch.constant.none
%none_3048 = torch.constant.none
%cpu_3049 = torch.constant.device "cpu"
%false_3050 = torch.constant.bool false
%3022 = torch.aten.arange.start %int0_3045, %int128_3046, %none_3047, %none_3048, %cpu_3049, %false_3050 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_3051 = torch.constant.int 2
%3023 = torch.aten.floor_divide.Scalar %3022, %int2_3051 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_3052 = torch.constant.int 6
%3024 = torch.prims.convert_element_type %3023, %int6_3052 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_3053 = torch.constant.int 128
%3025 = torch.aten.div.Scalar %3024, %int128_3053 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_3054 = torch.constant.float 2.000000e+00
%3026 = torch.aten.mul.Scalar %3025, %float2.000000e00_3054 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_3055 = torch.constant.float 5.000000e+05
%3027 = torch.aten.pow.Scalar %float5.000000e05_3055, %3026 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%3028 = torch.aten.reciprocal %3027 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_3056 = torch.constant.float 1.000000e+00
%3029 = torch.aten.mul.Scalar %3028, %float1.000000e00_3056 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_3057 = torch.constant.int 131072
%int1_3058 = torch.constant.int 1
%3030 = torch.prim.ListConstruct %int131072_3057, %int1_3058 : (!torch.int, !torch.int) -> !torch.list<int>
%3031 = torch.aten.view %3021, %3030 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%3032 = torch.aten.mul.Tensor %3031, %3029 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_3059 = torch.constant.int 1
%3033 = torch.aten.size.int %2993, %int1_3059 : !torch.vtensor<[1,?,4096],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_3060 = torch.constant.int 0
%3034 = torch.aten.add.int %int0_3060, %3033 : !torch.int, !torch.int -> !torch.int
%int0_3061 = torch.constant.int 0
%int0_3062 = torch.constant.int 0
%int1_3063 = torch.constant.int 1
%3035 = torch.aten.slice.Tensor %3032, %int0_3061, %int0_3062, %3034, %int1_3063 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3035, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3064 = torch.constant.int 1
%int0_3065 = torch.constant.int 0
%int9223372036854775807_3066 = torch.constant.int 9223372036854775807
%int1_3067 = torch.constant.int 1
%3036 = torch.aten.slice.Tensor %3035, %int1_3064, %int0_3065, %int9223372036854775807_3066, %int1_3067 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3036, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3068 = torch.constant.int 1
%int0_3069 = torch.constant.int 0
%int9223372036854775807_3070 = torch.constant.int 9223372036854775807
%int1_3071 = torch.constant.int 1
%3037 = torch.aten.slice.Tensor %3036, %int1_3068, %int0_3069, %int9223372036854775807_3070, %int1_3071 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3037, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_3072 = torch.constant.int 0
%3038 = torch.aten.unsqueeze %3037, %int0_3072 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3038, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3073 = torch.constant.int 1
%int0_3074 = torch.constant.int 0
%int9223372036854775807_3075 = torch.constant.int 9223372036854775807
%int1_3076 = torch.constant.int 1
%3039 = torch.aten.slice.Tensor %3038, %int1_3073, %int0_3074, %int9223372036854775807_3075, %int1_3076 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3039, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_3077 = torch.constant.int 2
%int0_3078 = torch.constant.int 0
%int9223372036854775807_3079 = torch.constant.int 9223372036854775807
%int1_3080 = torch.constant.int 1
%3040 = torch.aten.slice.Tensor %3039, %int2_3077, %int0_3078, %int9223372036854775807_3079, %int1_3080 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3040, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3081 = torch.constant.int 1
%int1_3082 = torch.constant.int 1
%int1_3083 = torch.constant.int 1
%3041 = torch.prim.ListConstruct %int1_3081, %int1_3082, %int1_3083 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3042 = torch.aten.repeat %3040, %3041 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3042, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_3084 = torch.constant.int 6
%3043 = torch.prims.convert_element_type %3016, %int6_3084 : !torch.vtensor<[1,?,32,128],bf16>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3043, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%3044 = torch_c.to_builtin_tensor %3043 : !torch.vtensor<[1,?,32,128],f32> -> tensor<1x?x32x128xf32>
%3045 = torch_c.to_builtin_tensor %3042 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%3046 = util.call @sharktank_rotary_embedding_1_D_32_128_f32(%3044, %3045) : (tensor<1x?x32x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x32x128xf32>
%3047 = torch_c.from_builtin_tensor %3046 : tensor<1x?x32x128xf32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3047, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_3085 = torch.constant.int 15
%3048 = torch.prims.convert_element_type %3047, %int15_3085 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %3048, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
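// The same frequency-table construction is repeated for the 8-head tensor and applied via sharktank_rotary_embedding_1_D_8_128_f32.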
%int131072_3086 = torch.constant.int 131072
%none_3087 = torch.constant.none
%none_3088 = torch.constant.none
%cpu_3089 = torch.constant.device "cpu"
%false_3090 = torch.constant.bool false
%3049 = torch.aten.arange %int131072_3086, %none_3087, %none_3088, %cpu_3089, %false_3090 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64>
%int0_3091 = torch.constant.int 0
%int128_3092 = torch.constant.int 128
%none_3093 = torch.constant.none
%none_3094 = torch.constant.none
%cpu_3095 = torch.constant.device "cpu"
%false_3096 = torch.constant.bool false
%3050 = torch.aten.arange.start %int0_3091, %int128_3092, %none_3093, %none_3094, %cpu_3095, %false_3096 : !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],si64>
%int2_3097 = torch.constant.int 2
%3051 = torch.aten.floor_divide.Scalar %3050, %int2_3097 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],si64>
%int6_3098 = torch.constant.int 6
%3052 = torch.prims.convert_element_type %3051, %int6_3098 : !torch.vtensor<[128],si64>, !torch.int -> !torch.vtensor<[128],f32>
%int128_3099 = torch.constant.int 128
%3053 = torch.aten.div.Scalar %3052, %int128_3099 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
%float2.000000e00_3100 = torch.constant.float 2.000000e+00
%3054 = torch.aten.mul.Scalar %3053, %float2.000000e00_3100 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%float5.000000e05_3101 = torch.constant.float 5.000000e+05
%3055 = torch.aten.pow.Scalar %float5.000000e05_3101, %3054 : !torch.float, !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%3056 = torch.aten.reciprocal %3055 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
%float1.000000e00_3102 = torch.constant.float 1.000000e+00
%3057 = torch.aten.mul.Scalar %3056, %float1.000000e00_3102 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
%int131072_3103 = torch.constant.int 131072
%int1_3104 = torch.constant.int 1
%3058 = torch.prim.ListConstruct %int131072_3103, %int1_3104 : (!torch.int, !torch.int) -> !torch.list<int>
%3059 = torch.aten.view %3049, %3058 : !torch.vtensor<[131072],si64>, !torch.list<int> -> !torch.vtensor<[131072,1],si64>
%3060 = torch.aten.mul.Tensor %3059, %3057 : !torch.vtensor<[131072,1],si64>, !torch.vtensor<[128],f32> -> !torch.vtensor<[131072,128],f32>
%int1_3105 = torch.constant.int 1
%3061 = torch.aten.size.int %3003, %int1_3105 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int0_3106 = torch.constant.int 0
%3062 = torch.aten.add.int %int0_3106, %3061 : !torch.int, !torch.int -> !torch.int
%int0_3107 = torch.constant.int 0
%int0_3108 = torch.constant.int 0
%int1_3109 = torch.constant.int 1
%3063 = torch.aten.slice.Tensor %3060, %int0_3107, %int0_3108, %3062, %int1_3109 : !torch.vtensor<[131072,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3063, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3110 = torch.constant.int 1
%int0_3111 = torch.constant.int 0
%int9223372036854775807_3112 = torch.constant.int 9223372036854775807
%int1_3113 = torch.constant.int 1
%3064 = torch.aten.slice.Tensor %3063, %int1_3110, %int0_3111, %int9223372036854775807_3112, %int1_3113 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3064, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int1_3114 = torch.constant.int 1
%int0_3115 = torch.constant.int 0
%int9223372036854775807_3116 = torch.constant.int 9223372036854775807
%int1_3117 = torch.constant.int 1
%3065 = torch.aten.slice.Tensor %3064, %int1_3114, %int0_3115, %int9223372036854775807_3116, %int1_3117 : !torch.vtensor<[?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],f32>
torch.bind_symbolic_shape %3065, [%548], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],f32>
%int0_3118 = torch.constant.int 0
%3066 = torch.aten.unsqueeze %3065, %int0_3118 : !torch.vtensor<[?,128],f32>, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3066, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3119 = torch.constant.int 1
%int0_3120 = torch.constant.int 0
%int9223372036854775807_3121 = torch.constant.int 9223372036854775807
%int1_3122 = torch.constant.int 1
%3067 = torch.aten.slice.Tensor %3066, %int1_3119, %int0_3120, %int9223372036854775807_3121, %int1_3122 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3067, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int2_3123 = torch.constant.int 2
%int0_3124 = torch.constant.int 0
%int9223372036854775807_3125 = torch.constant.int 9223372036854775807
%int1_3126 = torch.constant.int 1
%3068 = torch.aten.slice.Tensor %3067, %int2_3123, %int0_3124, %int9223372036854775807_3125, %int1_3126 : !torch.vtensor<[1,?,128],f32>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3068, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int1_3127 = torch.constant.int 1
%int1_3128 = torch.constant.int 1
%int1_3129 = torch.constant.int 1
%3069 = torch.prim.ListConstruct %int1_3127, %int1_3128, %int1_3129 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3070 = torch.aten.repeat %3068, %3069 : !torch.vtensor<[1,?,128],f32>, !torch.list<int> -> !torch.vtensor<[1,?,128],f32>
torch.bind_symbolic_shape %3070, [%548], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],f32>
%int6_3130 = torch.constant.int 6
%3071 = torch.prims.convert_element_type %3018, %int6_3130 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %3071, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%3072 = torch_c.to_builtin_tensor %3071 : !torch.vtensor<[1,?,8,128],f32> -> tensor<1x?x8x128xf32>
%3073 = torch_c.to_builtin_tensor %3070 : !torch.vtensor<[1,?,128],f32> -> tensor<1x?x128xf32>
%3074 = util.call @sharktank_rotary_embedding_1_D_8_128_f32(%3072, %3073) : (tensor<1x?x8x128xf32>, tensor<1x?x128xf32>) -> tensor<1x?x8x128xf32>
%3075 = torch_c.from_builtin_tensor %3074 : tensor<1x?x8x128xf32> -> !torch.vtensor<[1,?,8,128],f32>
torch.bind_symbolic_shape %3075, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f32>
%int15_3131 = torch.constant.int 15
%3076 = torch.prims.convert_element_type %3075, %int15_3131 : !torch.vtensor<[1,?,8,128],f32>, !torch.int -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3076, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
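// Quantize both 8-head tensors for the cache (the rotated one and the un-rotated %3020): divide by the shared quantizer scale %195, clamp to [-240, 240], and convert to f8E4M3FNUZ.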
%3077 = torch.aten.div.Tensor %3076, %195 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3077, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_3132 = torch.constant.float -2.400000e+02
%float2.400000e02_3133 = torch.constant.float 2.400000e+02
%3078 = torch.aten.clamp %3077, %float-2.400000e02_3132, %float2.400000e02_3133 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3078, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_3134 = torch.constant.int 26
%3079 = torch.prims.convert_element_type %3078, %int26_3134 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3079, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
%3080 = torch.aten.div.Tensor %3020, %195 : !torch.vtensor<[1,?,8,128],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3080, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%float-2.400000e02_3135 = torch.constant.float -2.400000e+02
%float2.400000e02_3136 = torch.constant.float 2.400000e+02
%3081 = torch.aten.clamp %3080, %float-2.400000e02_3135, %float2.400000e02_3136 : !torch.vtensor<[1,?,8,128],bf16>, !torch.float, !torch.float -> !torch.vtensor<[1,?,8,128],bf16>
torch.bind_symbolic_shape %3081, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],bf16>
%int26_3137 = torch.constant.int 26
%3082 = torch.prims.convert_element_type %3081, %int26_3137 : !torch.vtensor<[1,?,8,128],bf16>, !torch.int -> !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3082, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 128)> : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>
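// Cache write: row indices are %arg2 * 64 plus slot offsets 22 and 23; the flat [?, 2097152] f16 cache %2899 is viewed as [?, 32, 2, 32, 8, 128], updated with index_put at those rows, and flattened back.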
%int64_3138 = torch.constant.int 64
%3083 = torch.aten.mul.Scalar %arg2, %int64_3138 : !torch.vtensor<[1,?],si64>, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %3083, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int22 = torch.constant.int 22
%int1_3139 = torch.constant.int 1
%3084 = torch.aten.add.Scalar %3083, %int22, %int1_3139 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %3084, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%int1_3140 = torch.constant.int 1
%int32_3141 = torch.constant.int 32
%int8_3142 = torch.constant.int 8
%int128_3143 = torch.constant.int 128
%3085 = torch.prim.ListConstruct %int1_3140, %670, %int32_3141, %int8_3142, %int128_3143 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3086 = torch.aten.view %3079, %3085 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3086, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_3144 = torch.constant.int 32
%int8_3145 = torch.constant.int 8
%int128_3146 = torch.constant.int 128
%3087 = torch.prim.ListConstruct %670, %int32_3144, %int8_3145, %int128_3146 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3088 = torch.aten.view %3086, %3087 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3088, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%3089 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%3090 = torch.aten.view %3084, %3089 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %3090, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%int32_3147 = torch.constant.int 32
%int2_3148 = torch.constant.int 2
%int32_3149 = torch.constant.int 32
%int8_3150 = torch.constant.int 8
%int128_3151 = torch.constant.int 128
%3091 = torch.prim.ListConstruct %661, %int32_3147, %int2_3148, %int32_3149, %int8_3150, %int128_3151 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3092 = torch.aten.view %2899, %3091 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3092, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_3152 = torch.constant.int 32
%3093 = torch.aten.mul.int %661, %int32_3152 : !torch.int, !torch.int -> !torch.int
%int2_3153 = torch.constant.int 2
%3094 = torch.aten.mul.int %3093, %int2_3153 : !torch.int, !torch.int -> !torch.int
%int32_3154 = torch.constant.int 32
%int8_3155 = torch.constant.int 8
%int128_3156 = torch.constant.int 128
%3095 = torch.prim.ListConstruct %3094, %int32_3154, %int8_3155, %int128_3156 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3096 = torch.aten.view %3092, %3095 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3096, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%3097 = torch.prim.ListConstruct %3090 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_3157 = torch.constant.bool false
%3098 = torch.aten.index_put %3096, %3097, %3088, %false_3157 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3098, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_3158 = torch.constant.int 32
%int2_3159 = torch.constant.int 2
%int32_3160 = torch.constant.int 32
%int8_3161 = torch.constant.int 8
%int128_3162 = torch.constant.int 128
%3099 = torch.prim.ListConstruct %661, %int32_3158, %int2_3159, %int32_3160, %int8_3161, %int128_3162 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3100 = torch.aten.view %3098, %3099 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3100, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_3163 = torch.constant.int 2097152
%3101 = torch.prim.ListConstruct %661, %int2097152_3163 : (!torch.int, !torch.int) -> !torch.list<int>
%3102 = torch.aten.view %3100, %3101 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %3102, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
%int32_3164 = torch.constant.int 32
%int2_3165 = torch.constant.int 2
%int32_3166 = torch.constant.int 32
%int8_3167 = torch.constant.int 8
%int128_3168 = torch.constant.int 128
%3103 = torch.prim.ListConstruct %661, %int32_3164, %int2_3165, %int32_3166, %int8_3167, %int128_3168 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3104 = torch.aten.view %3102, %3103 : !torch.vtensor<[?,2097152],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3104, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int32_3169 = torch.constant.int 32
%int8_3170 = torch.constant.int 8
%int128_3171 = torch.constant.int 128
%3105 = torch.prim.ListConstruct %3094, %int32_3169, %int8_3170, %int128_3171 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3106 = torch.aten.view %3104, %3105 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3106, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int1_3172 = torch.constant.int 1
%int32_3173 = torch.constant.int 32
%int8_3174 = torch.constant.int 8
%int128_3175 = torch.constant.int 128
%3107 = torch.prim.ListConstruct %int1_3172, %670, %int32_3173, %int8_3174, %int128_3175 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3108 = torch.aten.view %3082, %3107 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3108, [%548], affine_map<()[s0] -> (1, s0, 32, 8, 128)> : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>
%int32_3176 = torch.constant.int 32
%int8_3177 = torch.constant.int 8
%int128_3178 = torch.constant.int 128
%3109 = torch.prim.ListConstruct %670, %int32_3176, %int8_3177, %int128_3178 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3110 = torch.aten.view %3108, %3109 : !torch.vtensor<[1,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3110, [%548], affine_map<()[s0] -> (s0, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>
%int1_3179 = torch.constant.int 1
%int1_3180 = torch.constant.int 1
%3111 = torch.aten.add.Scalar %3084, %int1_3179, %int1_3180 : !torch.vtensor<[1,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %3111, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
%3112 = torch.prim.ListConstruct %670 : (!torch.int) -> !torch.list<int>
%3113 = torch.aten.view %3111, %3112 : !torch.vtensor<[1,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64>
torch.bind_symbolic_shape %3113, [%548], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],si64>
%3114 = torch.prim.ListConstruct %3113 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>>
%false_3181 = torch.constant.bool false
%3115 = torch.aten.index_put %3106, %3114, %3110, %false_3181 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.bool -> !torch.vtensor<[?,32,8,128],f16>
torch.bind_symbolic_shape %3115, [%549], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f16>
%int32_3182 = torch.constant.int 32
%int2_3183 = torch.constant.int 2
%int32_3184 = torch.constant.int 32
%int8_3185 = torch.constant.int 8
%int128_3186 = torch.constant.int 128
%3116 = torch.prim.ListConstruct %661, %int32_3182, %int2_3183, %int32_3184, %int8_3185, %int128_3186 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3117 = torch.aten.view %3115, %3116 : !torch.vtensor<[?,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f16>
torch.bind_symbolic_shape %3117, [%549], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f16>
%int2097152_3187 = torch.constant.int 2097152
%3118 = torch.prim.ListConstruct %661, %int2097152_3187 : (!torch.int, !torch.int) -> !torch.list<int>
%3119 = torch.aten.view %3117, %3118 : !torch.vtensor<[?,32,2,32,8,128],f16>, !torch.list<int> -> !torch.vtensor<[?,2097152],f16>
torch.bind_symbolic_shape %3119, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f16>
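// Grouped-query expansion: unsqueeze each 8-head f8 tensor to [1, seq, 8, 1, 128], expand by a factor of 4, and view as [1, seq, 32, 128] so the KV heads match the 32 query heads.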
%int-2_3188 = torch.constant.int -2
%3120 = torch.aten.unsqueeze %3079, %int-2_3188 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3120, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_3189 = torch.constant.int 1
%int8_3190 = torch.constant.int 8
%int4_3191 = torch.constant.int 4
%int128_3192 = torch.constant.int 128
%3121 = torch.prim.ListConstruct %int1_3189, %3061, %int8_3190, %int4_3191, %int128_3192 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_3193 = torch.constant.bool false
%3122 = torch.aten.expand %3120, %3121, %false_3193 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3122, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_3194 = torch.constant.int 0
%3123 = torch.aten.clone %3122, %int0_3194 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3123, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_3195 = torch.constant.int 1
%int32_3196 = torch.constant.int 32
%int128_3197 = torch.constant.int 128
%3124 = torch.prim.ListConstruct %int1_3195, %3061, %int32_3196, %int128_3197 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3125 = torch.aten._unsafe_view %3123, %3124 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3125, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
%int-2_3198 = torch.constant.int -2
%3126 = torch.aten.unsqueeze %3082, %int-2_3198 : !torch.vtensor<[1,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3126, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 1, 128)> : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>
%int1_3199 = torch.constant.int 1
%3127 = torch.aten.size.int %3013, %int1_3199 : !torch.vtensor<[1,?,1024],f8E4M3FNUZ>, !torch.int -> !torch.int
%int1_3200 = torch.constant.int 1
%int8_3201 = torch.constant.int 8
%int4_3202 = torch.constant.int 4
%int128_3203 = torch.constant.int 128
%3128 = torch.prim.ListConstruct %int1_3200, %3127, %int8_3201, %int4_3202, %int128_3203 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%false_3204 = torch.constant.bool false
%3129 = torch.aten.expand %3126, %3128, %false_3204 : !torch.vtensor<[1,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3129, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int0_3205 = torch.constant.int 0
%3130 = torch.aten.clone %3129, %int0_3205 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3130, [%548], affine_map<()[s0] -> (1, s0 * 32, 8, 4, 128)> : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>
%int1_3206 = torch.constant.int 1
%int32_3207 = torch.constant.int 32
%int128_3208 = torch.constant.int 128
%3131 = torch.prim.ListConstruct %int1_3206, %3127, %int32_3207, %int128_3208 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
%3132 = torch.aten._unsafe_view %3130, %3131 : !torch.vtensor<[1,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
torch.bind_symbolic_shape %3132, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>
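// Dequantize the expanded tensors for attention: convert to f32, multiply by the quantizer scale %195, and cast to bf16.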
%int6_3209 = torch.constant.int 6
%3133 = torch.prims.convert_element_type %3125, %int6_3209 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3133, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%3134 = torch.aten.mul.Tensor %3133, %195 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3134, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%int15_3210 = torch.constant.int 15
%3135 = torch.prims.convert_element_type %3134, %int15_3210 : !torch.vtensor<[1,?,32,128],f32>, !torch.int -> !torch.vtensor<[1,?,32,128],bf16>
torch.bind_symbolic_shape %3135, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],bf16>
%int6_3211 = torch.constant.int 6
%3136 = torch.prims.convert_element_type %3132, %int6_3211 : !torch.vtensor<[1,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,?,32,128],f32>
torch.bind_symbolic_shape %3136, [%548], affine_map<()[s0] -> (1, s0 * 32, 32, 128)> : !torch.vtensor<[1,?,32,128],f32>
%3137 = torch.aten.mul.Tensor %3136, %195 : !torch.vtensor<[1,?,32,128],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,32,128],f32>