Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created January 31, 2025 19:42
Show Gist options
  • Save pashu123/c9ee6885d403065f91b3004f23221d7d to your computer and use it in GitHub Desktop.
Save pashu123/c9ee6885d403065f91b3004f23221d7d to your computer and use it in GitHub Desktop.
module @module {
// Private global initialized from the named parameter "token_embd.weight" in
// the "model" parameter scope; declared as a 128256x4096 bf16 tensor.
// NOTE(review): presumably the token-embedding table (rows = vocabulary
// entries, columns = hidden size) — confirm against the exporting tool.
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
// blk.0: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.0.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.0.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.1: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.1.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.1.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.2: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.2.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.2.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.3: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.3.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.3.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.4: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.4.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.4.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.5: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.5.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.5.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.6: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.6.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.6.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.7: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.7.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.7.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.8: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.8.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.8.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.9: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.9.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.9.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.10: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.10.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.10.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.11: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.11.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.11.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// blk.12: private globals whose initial values are named parameters in the
// "model" scope (keys "blk.12.*"). The two norm weights are tensor<4096xbf16>,
// every ":rscale" entry is a rank-0 tensor<f32>, and every ":qs" entry is an
// f8E4M3FNUZ matrix with the shape given in its type.
// NOTE(review): ":qs"/":rscale" presumably pair quantized weights with their
// scale factors — confirm against the tool that exported this module.
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.12.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.13 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.13.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.14 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.14.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.15 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.15.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.16 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.16.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.17 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.17.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.18 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.18.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.18.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.18.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.18.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.18.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.19 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.19.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.19.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.19.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.19.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.19.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.20 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.20.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.20.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.20.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.20.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.20.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.21 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.21.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.21.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.21.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.21.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.21.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.22 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.22.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.22.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.22.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.22.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.22.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.23 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.23.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.23.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.23.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.23.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.23.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.24 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.24.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.24.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.24.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.24.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.24.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// ---- blk.25 parameter globals ----
// Each `util.global private` here is initialized from a named stream parameter
// in scope "model"; the parameter key equals the symbol name minus `__auto.`.
// Every `:rscale` entry is a rank-0 f32 tensor; every `weight:qs` entry is an
// f8E4M3FNUZ matrix.
// NOTE(review): `:qs` / `:rscale` look like quantized storage plus a scale
// factor - confirm against the exporter that produced this module.
// attn_* entries: norm weight is 4096xbf16; q and output weights are 4096x4096,
// k and v weights are 1024x4096.
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.25.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.25.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries: norm weight is 4096xbf16; gate and up weights are 14336x4096,
// down weight is 4096x14336.
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.25.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.25.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.25.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals for "blk.26". Each one is declared `private` and takes its initial
// value from the named parameter "model"::"blk.26.<name>" through
// #stream.parameter.named. Symbol names that contain ':' are written in quoted
// form (@"...").
//   - attn_norm.weight / ffn_norm.weight are tensor<4096xbf16>.
//   - "*:rscale" entries are rank-0 tensor<f32>.
//   - "*.weight:qs" entries are f8E4M3FNUZ matrices:
//       attn_q / attn_output 4096x4096, attn_k / attn_v 1024x4096,
//       ffn_gate / ffn_up 14336x4096, ffn_down 4096x14336.
// NOTE(review): ":rscale" / ":qs" presumably mean a quantization scale and the
// quantized weight samples -- confirm against the exporter that produced this.
// attn_* and kv_cache entries for blk.26.
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.26.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.26.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries for blk.26.
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.26.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.26.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.26.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals for "blk.27". Each one is declared `private` and takes its initial
// value from the named parameter "model"::"blk.27.<name>" through
// #stream.parameter.named. Symbol names that contain ':' are written in quoted
// form (@"...").
//   - attn_norm.weight / ffn_norm.weight are tensor<4096xbf16>.
//   - "*:rscale" entries are rank-0 tensor<f32>.
//   - "*.weight:qs" entries are f8E4M3FNUZ matrices:
//       attn_q / attn_output 4096x4096, attn_k / attn_v 1024x4096,
//       ffn_gate / ffn_up 14336x4096, ffn_down 4096x14336.
// NOTE(review): ":rscale" / ":qs" presumably mean a quantization scale and the
// quantized weight samples -- confirm against the exporter that produced this.
// attn_* and kv_cache entries for blk.27.
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.27.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.27.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries for blk.27.
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.27.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.27.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.27.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals for "blk.28". Each one is declared `private` and takes its initial
// value from the named parameter "model"::"blk.28.<name>" through
// #stream.parameter.named. Symbol names that contain ':' are written in quoted
// form (@"...").
//   - attn_norm.weight / ffn_norm.weight are tensor<4096xbf16>.
//   - "*:rscale" entries are rank-0 tensor<f32>.
//   - "*.weight:qs" entries are f8E4M3FNUZ matrices:
//       attn_q / attn_output 4096x4096, attn_k / attn_v 1024x4096,
//       ffn_gate / ffn_up 14336x4096, ffn_down 4096x14336.
// NOTE(review): ":rscale" / ":qs" presumably mean a quantization scale and the
// quantized weight samples -- confirm against the exporter that produced this.
// attn_* and kv_cache entries for blk.28.
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.28.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.28.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries for blk.28.
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.28.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.28.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.28.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals for "blk.29". Each one is declared `private` and takes its initial
// value from the named parameter "model"::"blk.29.<name>" through
// #stream.parameter.named. Symbol names that contain ':' are written in quoted
// form (@"...").
//   - attn_norm.weight / ffn_norm.weight are tensor<4096xbf16>.
//   - "*:rscale" entries are rank-0 tensor<f32>.
//   - "*.weight:qs" entries are f8E4M3FNUZ matrices:
//       attn_q / attn_output 4096x4096, attn_k / attn_v 1024x4096,
//       ffn_gate / ffn_up 14336x4096, ffn_down 4096x14336.
// NOTE(review): ":rscale" / ":qs" presumably mean a quantization scale and the
// quantized weight samples -- confirm against the exporter that produced this.
// attn_* and kv_cache entries for blk.29.
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.29.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.29.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries for blk.29.
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.29.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.29.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.29.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals for "blk.30". Each one is declared `private` and takes its initial
// value from the named parameter "model"::"blk.30.<name>" through
// #stream.parameter.named. Symbol names that contain ':' are written in quoted
// form (@"...").
//   - attn_norm.weight / ffn_norm.weight are tensor<4096xbf16>.
//   - "*:rscale" entries are rank-0 tensor<f32>.
//   - "*.weight:qs" entries are f8E4M3FNUZ matrices:
//       attn_q / attn_output 4096x4096, attn_k / attn_v 1024x4096,
//       ffn_gate / ffn_up 14336x4096, ffn_down 4096x14336.
// NOTE(review): ":rscale" / ":qs" presumably mean a quantization scale and the
// quantized weight samples -- confirm against the exporter that produced this.
// attn_* and kv_cache entries for blk.30.
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.30.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.30.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries for blk.30.
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.30.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.30.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.30.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals for "blk.31" (the last "blk.N" group before the output_* globals).
// Each one is declared `private` and takes its initial value from the named
// parameter "model"::"blk.31.<name>" through #stream.parameter.named. Symbol
// names that contain ':' are written in quoted form (@"...").
//   - attn_norm.weight / ffn_norm.weight are tensor<4096xbf16>.
//   - "*:rscale" entries are rank-0 tensor<f32>.
//   - "*.weight:qs" entries are f8E4M3FNUZ matrices:
//       attn_q / attn_output 4096x4096, attn_k / attn_v 1024x4096,
//       ffn_gate / ffn_up 14336x4096, ffn_down 4096x14336.
// NOTE(review): ":rscale" / ":qs" presumably mean a quantization scale and the
// quantized weight samples -- confirm against the exporter that produced this.
// attn_* and kv_cache entries for blk.31.
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.31.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.kv_cache.quantizer:rscale" = #stream.parameter.named<"model"::"blk.31.kv_cache.quantizer:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
// ffn_* entries for blk.31.
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.31.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.31.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.31.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
// Globals that are not part of any "blk.N" group. Both are declared `private`,
// hold bf16 data, and take their initial values from the "model" parameter
// scope: output_norm.weight is tensor<4096xbf16>, output.weight is
// tensor<128256x4096xbf16>.
// NOTE(review): unlike the per-block "*.weight:qs" tensors these are not
// f8E4M3FNUZ -- presumably left unquantized on purpose; confirm with the
// exporter.
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16>
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16>
func.func @prefill_bs1(%arg0: !torch.vtensor<[1,?],si64>, %arg1: !torch.vtensor<[1],si64>, %arg2: !torch.vtensor<[1,?],si64>, %arg3: !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[128256,4096],bf16> attributes {torch.assume_strict_symbolic_shapes} {
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xbf16>
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16>
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xbf16>
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.0.attn_q.q_input3Arscale = util.global.load @"__auto.blk.0.attn_q.q_input:rscale" : tensor<f32>
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_q.weight3Aqs = util.global.load @"__auto.blk.0.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.0.attn_k.q_input3Arscale = util.global.load @"__auto.blk.0.attn_k.q_input:rscale" : tensor<f32>
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_k.weight3Aqs = util.global.load @"__auto.blk.0.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%5 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.0.attn_v.q_input3Arscale = util.global.load @"__auto.blk.0.attn_v.q_input:rscale" : tensor<f32>
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_v.weight3Aqs = util.global.load @"__auto.blk.0.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%7 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.0.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.0.kv_cache.quantizer:rscale" : tensor<f32>
%8 = torch_c.from_builtin_tensor %__auto.blk.0.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_output.q_input3Arscale = util.global.load @"__auto.blk.0.attn_output.q_input:rscale" : tensor<f32>
%9 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.attn_output.weight3Aqs = util.global.load @"__auto.blk.0.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%10 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xbf16>
%11 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.0.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_gate.q_input:rscale" : tensor<f32>
%12 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.0.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%13 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_up.q_input:rscale" : tensor<f32>
%14 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_up.weight3Aqs = util.global.load @"__auto.blk.0.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%15 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.0.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_down.q_input:rscale" : tensor<f32>
%16 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.0.ffn_down.weight3Aqs = util.global.load @"__auto.blk.0.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%17 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xbf16>
%18 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.1.attn_q.q_input3Arscale = util.global.load @"__auto.blk.1.attn_q.q_input:rscale" : tensor<f32>
%19 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_q.weight3Aqs = util.global.load @"__auto.blk.1.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%20 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.1.attn_k.q_input3Arscale = util.global.load @"__auto.blk.1.attn_k.q_input:rscale" : tensor<f32>
%21 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_k.weight3Aqs = util.global.load @"__auto.blk.1.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%22 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.1.attn_v.q_input3Arscale = util.global.load @"__auto.blk.1.attn_v.q_input:rscale" : tensor<f32>
%23 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_v.weight3Aqs = util.global.load @"__auto.blk.1.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%24 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.1.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.1.kv_cache.quantizer:rscale" : tensor<f32>
%25 = torch_c.from_builtin_tensor %__auto.blk.1.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_output.q_input3Arscale = util.global.load @"__auto.blk.1.attn_output.q_input:rscale" : tensor<f32>
%26 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.attn_output.weight3Aqs = util.global.load @"__auto.blk.1.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%27 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xbf16>
%28 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.1.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_gate.q_input:rscale" : tensor<f32>
%29 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.1.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%30 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_up.q_input:rscale" : tensor<f32>
%31 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_up.weight3Aqs = util.global.load @"__auto.blk.1.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%32 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.1.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_down.q_input:rscale" : tensor<f32>
%33 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.1.ffn_down.weight3Aqs = util.global.load @"__auto.blk.1.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%34 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xbf16>
%35 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.2.attn_q.q_input3Arscale = util.global.load @"__auto.blk.2.attn_q.q_input:rscale" : tensor<f32>
%36 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_q.weight3Aqs = util.global.load @"__auto.blk.2.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%37 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.2.attn_k.q_input3Arscale = util.global.load @"__auto.blk.2.attn_k.q_input:rscale" : tensor<f32>
%38 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_k.weight3Aqs = util.global.load @"__auto.blk.2.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%39 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.2.attn_v.q_input3Arscale = util.global.load @"__auto.blk.2.attn_v.q_input:rscale" : tensor<f32>
%40 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_v.weight3Aqs = util.global.load @"__auto.blk.2.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%41 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.2.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.2.kv_cache.quantizer:rscale" : tensor<f32>
%42 = torch_c.from_builtin_tensor %__auto.blk.2.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_output.q_input3Arscale = util.global.load @"__auto.blk.2.attn_output.q_input:rscale" : tensor<f32>
%43 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.attn_output.weight3Aqs = util.global.load @"__auto.blk.2.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%44 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xbf16>
%45 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.2.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_gate.q_input:rscale" : tensor<f32>
%46 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.2.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%47 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_up.q_input:rscale" : tensor<f32>
%48 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_up.weight3Aqs = util.global.load @"__auto.blk.2.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%49 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.2.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_down.q_input:rscale" : tensor<f32>
%50 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.2.ffn_down.weight3Aqs = util.global.load @"__auto.blk.2.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%51 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xbf16>
%52 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.3.attn_q.q_input3Arscale = util.global.load @"__auto.blk.3.attn_q.q_input:rscale" : tensor<f32>
%53 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_q.weight3Aqs = util.global.load @"__auto.blk.3.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%54 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.3.attn_k.q_input3Arscale = util.global.load @"__auto.blk.3.attn_k.q_input:rscale" : tensor<f32>
%55 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_k.weight3Aqs = util.global.load @"__auto.blk.3.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%56 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.3.attn_v.q_input3Arscale = util.global.load @"__auto.blk.3.attn_v.q_input:rscale" : tensor<f32>
%57 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_v.weight3Aqs = util.global.load @"__auto.blk.3.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%58 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.3.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.3.kv_cache.quantizer:rscale" : tensor<f32>
%59 = torch_c.from_builtin_tensor %__auto.blk.3.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_output.q_input3Arscale = util.global.load @"__auto.blk.3.attn_output.q_input:rscale" : tensor<f32>
%60 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.attn_output.weight3Aqs = util.global.load @"__auto.blk.3.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%61 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xbf16>
%62 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.3.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_gate.q_input:rscale" : tensor<f32>
%63 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.3.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%64 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_up.q_input:rscale" : tensor<f32>
%65 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_up.weight3Aqs = util.global.load @"__auto.blk.3.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%66 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.3.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_down.q_input:rscale" : tensor<f32>
%67 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.3.ffn_down.weight3Aqs = util.global.load @"__auto.blk.3.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%68 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.4 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xbf16>
%69 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.4.attn_q.q_input3Arscale = util.global.load @"__auto.blk.4.attn_q.q_input:rscale" : tensor<f32>
%70 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_q.weight3Aqs = util.global.load @"__auto.blk.4.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%71 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.4.attn_k.q_input3Arscale = util.global.load @"__auto.blk.4.attn_k.q_input:rscale" : tensor<f32>
%72 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_k.weight3Aqs = util.global.load @"__auto.blk.4.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%73 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.4.attn_v.q_input3Arscale = util.global.load @"__auto.blk.4.attn_v.q_input:rscale" : tensor<f32>
%74 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_v.weight3Aqs = util.global.load @"__auto.blk.4.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%75 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.4.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.4.kv_cache.quantizer:rscale" : tensor<f32>
%76 = torch_c.from_builtin_tensor %__auto.blk.4.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.4.attn_output.q_input3Arscale = util.global.load @"__auto.blk.4.attn_output.q_input:rscale" : tensor<f32>
%77 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.attn_output.weight3Aqs = util.global.load @"__auto.blk.4.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%78 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xbf16>
%79 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.4.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_gate.q_input:rscale" : tensor<f32>
%80 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.4.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%81 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.4.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_up.q_input:rscale" : tensor<f32>
%82 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_up.weight3Aqs = util.global.load @"__auto.blk.4.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%83 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.4.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_down.q_input:rscale" : tensor<f32>
%84 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.4.ffn_down.weight3Aqs = util.global.load @"__auto.blk.4.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%85 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.5 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xbf16>
%86 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.5.attn_q.q_input3Arscale = util.global.load @"__auto.blk.5.attn_q.q_input:rscale" : tensor<f32>
%87 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_q.weight3Aqs = util.global.load @"__auto.blk.5.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%88 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.5.attn_k.q_input3Arscale = util.global.load @"__auto.blk.5.attn_k.q_input:rscale" : tensor<f32>
%89 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_k.weight3Aqs = util.global.load @"__auto.blk.5.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%90 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.5.attn_v.q_input3Arscale = util.global.load @"__auto.blk.5.attn_v.q_input:rscale" : tensor<f32>
%91 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_v.weight3Aqs = util.global.load @"__auto.blk.5.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%92 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.5.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.5.kv_cache.quantizer:rscale" : tensor<f32>
%93 = torch_c.from_builtin_tensor %__auto.blk.5.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.5.attn_output.q_input3Arscale = util.global.load @"__auto.blk.5.attn_output.q_input:rscale" : tensor<f32>
%94 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.attn_output.weight3Aqs = util.global.load @"__auto.blk.5.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%95 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xbf16>
%96 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.5.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_gate.q_input:rscale" : tensor<f32>
%97 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.5.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%98 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.5.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_up.q_input:rscale" : tensor<f32>
%99 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_up.weight3Aqs = util.global.load @"__auto.blk.5.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%100 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.5.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_down.q_input:rscale" : tensor<f32>
%101 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.5.ffn_down.weight3Aqs = util.global.load @"__auto.blk.5.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%102 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.6 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xbf16>
%103 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.6.attn_q.q_input3Arscale = util.global.load @"__auto.blk.6.attn_q.q_input:rscale" : tensor<f32>
%104 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_q.weight3Aqs = util.global.load @"__auto.blk.6.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%105 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.6.attn_k.q_input3Arscale = util.global.load @"__auto.blk.6.attn_k.q_input:rscale" : tensor<f32>
%106 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_k.weight3Aqs = util.global.load @"__auto.blk.6.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%107 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.6.attn_v.q_input3Arscale = util.global.load @"__auto.blk.6.attn_v.q_input:rscale" : tensor<f32>
%108 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_v.weight3Aqs = util.global.load @"__auto.blk.6.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%109 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.6.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.6.kv_cache.quantizer:rscale" : tensor<f32>
%110 = torch_c.from_builtin_tensor %__auto.blk.6.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.6.attn_output.q_input3Arscale = util.global.load @"__auto.blk.6.attn_output.q_input:rscale" : tensor<f32>
%111 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.attn_output.weight3Aqs = util.global.load @"__auto.blk.6.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%112 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xbf16>
%113 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.6.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_gate.q_input:rscale" : tensor<f32>
%114 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.6.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%115 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.6.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_up.q_input:rscale" : tensor<f32>
%116 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_up.weight3Aqs = util.global.load @"__auto.blk.6.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%117 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.6.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_down.q_input:rscale" : tensor<f32>
%118 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.6.ffn_down.weight3Aqs = util.global.load @"__auto.blk.6.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%119 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.7 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xbf16>
%120 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.7.attn_q.q_input3Arscale = util.global.load @"__auto.blk.7.attn_q.q_input:rscale" : tensor<f32>
%121 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_q.weight3Aqs = util.global.load @"__auto.blk.7.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%122 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.7.attn_k.q_input3Arscale = util.global.load @"__auto.blk.7.attn_k.q_input:rscale" : tensor<f32>
%123 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_k.weight3Aqs = util.global.load @"__auto.blk.7.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%124 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.7.attn_v.q_input3Arscale = util.global.load @"__auto.blk.7.attn_v.q_input:rscale" : tensor<f32>
%125 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_v.weight3Aqs = util.global.load @"__auto.blk.7.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%126 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.7.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.7.kv_cache.quantizer:rscale" : tensor<f32>
%127 = torch_c.from_builtin_tensor %__auto.blk.7.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.7.attn_output.q_input3Arscale = util.global.load @"__auto.blk.7.attn_output.q_input:rscale" : tensor<f32>
%128 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.attn_output.weight3Aqs = util.global.load @"__auto.blk.7.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%129 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xbf16>
%130 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.7.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_gate.q_input:rscale" : tensor<f32>
%131 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.7.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%132 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.7.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_up.q_input:rscale" : tensor<f32>
%133 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_up.weight3Aqs = util.global.load @"__auto.blk.7.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%134 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.7.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_down.q_input:rscale" : tensor<f32>
%135 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.7.ffn_down.weight3Aqs = util.global.load @"__auto.blk.7.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%136 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.8 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xbf16>
%137 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.8.attn_q.q_input3Arscale = util.global.load @"__auto.blk.8.attn_q.q_input:rscale" : tensor<f32>
%138 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_q.weight3Aqs = util.global.load @"__auto.blk.8.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%139 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.8.attn_k.q_input3Arscale = util.global.load @"__auto.blk.8.attn_k.q_input:rscale" : tensor<f32>
%140 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_k.weight3Aqs = util.global.load @"__auto.blk.8.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%141 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.8.attn_v.q_input3Arscale = util.global.load @"__auto.blk.8.attn_v.q_input:rscale" : tensor<f32>
%142 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_v.weight3Aqs = util.global.load @"__auto.blk.8.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%143 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.8.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.8.kv_cache.quantizer:rscale" : tensor<f32>
%144 = torch_c.from_builtin_tensor %__auto.blk.8.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.8.attn_output.q_input3Arscale = util.global.load @"__auto.blk.8.attn_output.q_input:rscale" : tensor<f32>
%145 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.attn_output.weight3Aqs = util.global.load @"__auto.blk.8.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%146 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xbf16>
%147 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.8.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_gate.q_input:rscale" : tensor<f32>
%148 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.8.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%149 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.8.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_up.q_input:rscale" : tensor<f32>
%150 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_up.weight3Aqs = util.global.load @"__auto.blk.8.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%151 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.8.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_down.q_input:rscale" : tensor<f32>
%152 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.8.ffn_down.weight3Aqs = util.global.load @"__auto.blk.8.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%153 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.9 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xbf16>
%154 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.9.attn_q.q_input3Arscale = util.global.load @"__auto.blk.9.attn_q.q_input:rscale" : tensor<f32>
%155 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_q.weight3Aqs = util.global.load @"__auto.blk.9.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%156 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.9.attn_k.q_input3Arscale = util.global.load @"__auto.blk.9.attn_k.q_input:rscale" : tensor<f32>
%157 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_k.weight3Aqs = util.global.load @"__auto.blk.9.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%158 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.9.attn_v.q_input3Arscale = util.global.load @"__auto.blk.9.attn_v.q_input:rscale" : tensor<f32>
%159 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_v.weight3Aqs = util.global.load @"__auto.blk.9.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%160 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.9.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.9.kv_cache.quantizer:rscale" : tensor<f32>
%161 = torch_c.from_builtin_tensor %__auto.blk.9.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.9.attn_output.q_input3Arscale = util.global.load @"__auto.blk.9.attn_output.q_input:rscale" : tensor<f32>
%162 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.attn_output.weight3Aqs = util.global.load @"__auto.blk.9.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%163 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xbf16>
%164 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.9.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_gate.q_input:rscale" : tensor<f32>
%165 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.9.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%166 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.9.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_up.q_input:rscale" : tensor<f32>
%167 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_up.weight3Aqs = util.global.load @"__auto.blk.9.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%168 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.9.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_down.q_input:rscale" : tensor<f32>
%169 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.9.ffn_down.weight3Aqs = util.global.load @"__auto.blk.9.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%170 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.10 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xbf16>
%171 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.10.attn_q.q_input3Arscale = util.global.load @"__auto.blk.10.attn_q.q_input:rscale" : tensor<f32>
%172 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_q.weight3Aqs = util.global.load @"__auto.blk.10.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%173 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.10.attn_k.q_input3Arscale = util.global.load @"__auto.blk.10.attn_k.q_input:rscale" : tensor<f32>
%174 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_k.weight3Aqs = util.global.load @"__auto.blk.10.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%175 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.10.attn_v.q_input3Arscale = util.global.load @"__auto.blk.10.attn_v.q_input:rscale" : tensor<f32>
%176 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_v.weight3Aqs = util.global.load @"__auto.blk.10.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%177 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.10.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.10.kv_cache.quantizer:rscale" : tensor<f32>
%178 = torch_c.from_builtin_tensor %__auto.blk.10.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.10.attn_output.q_input3Arscale = util.global.load @"__auto.blk.10.attn_output.q_input:rscale" : tensor<f32>
%179 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.attn_output.weight3Aqs = util.global.load @"__auto.blk.10.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%180 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xbf16>
%181 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.10.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_gate.q_input:rscale" : tensor<f32>
%182 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.10.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%183 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.10.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_up.q_input:rscale" : tensor<f32>
%184 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_up.weight3Aqs = util.global.load @"__auto.blk.10.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%185 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.10.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_down.q_input:rscale" : tensor<f32>
%186 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.10.ffn_down.weight3Aqs = util.global.load @"__auto.blk.10.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%187 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.11 parameter loads.
// Each pair reads a module-level global with util.global.load and then wraps
// the builtin tensor as a !torch.vtensor via torch_c.from_builtin_tensor,
// keeping the same shape and element type.
// The "3A" inside the SSA names lines up with the ':' of the quoted global
// symbol (0x3A is ASCII ':').
// NOTE(review): "q_input:rscale" / "weight:qs" look like an input-quantization
// scale and quantized weights -- confirm against the exporter.
// attn_norm weight: 4096 x bf16.
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xbf16>
%188 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.11.attn_q.q_input3Arscale = util.global.load @"__auto.blk.11.attn_q.q_input:rscale" : tensor<f32>
%189 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_q.weight3Aqs = util.global.load @"__auto.blk.11.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%190 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// attn_k: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.11.attn_k.q_input3Arscale = util.global.load @"__auto.blk.11.attn_k.q_input:rscale" : tensor<f32>
%191 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_k.weight3Aqs = util.global.load @"__auto.blk.11.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%192 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// attn_v: rank-0 f32 rscale, then 1024x4096 f8E4M3FNUZ weight.
%__auto.blk.11.attn_v.q_input3Arscale = util.global.load @"__auto.blk.11.attn_v.q_input:rscale" : tensor<f32>
%193 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_v.weight3Aqs = util.global.load @"__auto.blk.11.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%194 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer: rank-0 f32 rscale only (no weight tensor is loaded for it here).
%__auto.blk.11.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.11.kv_cache.quantizer:rscale" : tensor<f32>
%195 = torch_c.from_builtin_tensor %__auto.blk.11.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 rscale, then 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.11.attn_output.q_input3Arscale = util.global.load @"__auto.blk.11.attn_output.q_input:rscale" : tensor<f32>
%196 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.attn_output.weight3Aqs = util.global.load @"__auto.blk.11.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%197 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm weight: 4096 x bf16.
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xbf16>
%198 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.11.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_gate.q_input:rscale" : tensor<f32>
%199 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.11.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%200 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_up: rank-0 f32 rscale, then 14336x4096 f8E4M3FNUZ weight.
%__auto.blk.11.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_up.q_input:rscale" : tensor<f32>
%201 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_up.weight3Aqs = util.global.load @"__auto.blk.11.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%202 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
// ffn_down: rank-0 f32 rscale, then 4096x14336 f8E4M3FNUZ weight
// (dimensions swapped relative to gate/up).
%__auto.blk.11.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_down.q_input:rscale" : tensor<f32>
%203 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.11.ffn_down.weight3Aqs = util.global.load @"__auto.blk.11.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%204 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.12 parameters (results %205 .. %221).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.12 attn_norm weight (4096 x bf16).
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xbf16>
%205 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.12 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.12.attn_q.q_input3Arscale = util.global.load @"__auto.blk.12.attn_q.q_input:rscale" : tensor<f32>
%206 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_q.weight3Aqs = util.global.load @"__auto.blk.12.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%207 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.12.attn_k.q_input3Arscale = util.global.load @"__auto.blk.12.attn_k.q_input:rscale" : tensor<f32>
%208 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_k.weight3Aqs = util.global.load @"__auto.blk.12.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%209 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.12.attn_v.q_input3Arscale = util.global.load @"__auto.blk.12.attn_v.q_input:rscale" : tensor<f32>
%210 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_v.weight3Aqs = util.global.load @"__auto.blk.12.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%211 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.12 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.12.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.12.kv_cache.quantizer:rscale" : tensor<f32>
%212 = torch_c.from_builtin_tensor %__auto.blk.12.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.12 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.12.attn_output.q_input3Arscale = util.global.load @"__auto.blk.12.attn_output.q_input:rscale" : tensor<f32>
%213 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.attn_output.weight3Aqs = util.global.load @"__auto.blk.12.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%214 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.12 ffn_norm weight (4096 x bf16).
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xbf16>
%215 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.12 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.12.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_gate.q_input:rscale" : tensor<f32>
%216 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.12.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%217 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_up.q_input:rscale" : tensor<f32>
%218 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_up.weight3Aqs = util.global.load @"__auto.blk.12.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%219 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.12.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_down.q_input:rscale" : tensor<f32>
%220 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.12.ffn_down.weight3Aqs = util.global.load @"__auto.blk.12.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%221 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.13 parameters (results %222 .. %238).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.13 attn_norm weight (4096 x bf16).
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xbf16>
%222 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.13 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.13.attn_q.q_input3Arscale = util.global.load @"__auto.blk.13.attn_q.q_input:rscale" : tensor<f32>
%223 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_q.weight3Aqs = util.global.load @"__auto.blk.13.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%224 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.13.attn_k.q_input3Arscale = util.global.load @"__auto.blk.13.attn_k.q_input:rscale" : tensor<f32>
%225 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_k.weight3Aqs = util.global.load @"__auto.blk.13.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%226 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.13.attn_v.q_input3Arscale = util.global.load @"__auto.blk.13.attn_v.q_input:rscale" : tensor<f32>
%227 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_v.weight3Aqs = util.global.load @"__auto.blk.13.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%228 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.13 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.13.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.13.kv_cache.quantizer:rscale" : tensor<f32>
%229 = torch_c.from_builtin_tensor %__auto.blk.13.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.13 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.13.attn_output.q_input3Arscale = util.global.load @"__auto.blk.13.attn_output.q_input:rscale" : tensor<f32>
%230 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.attn_output.weight3Aqs = util.global.load @"__auto.blk.13.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%231 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.13 ffn_norm weight (4096 x bf16).
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xbf16>
%232 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.13 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.13.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_gate.q_input:rscale" : tensor<f32>
%233 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.13.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%234 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_up.q_input:rscale" : tensor<f32>
%235 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_up.weight3Aqs = util.global.load @"__auto.blk.13.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%236 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.13.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_down.q_input:rscale" : tensor<f32>
%237 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.13.ffn_down.weight3Aqs = util.global.load @"__auto.blk.13.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%238 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.14 parameters (results %239 .. %255).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.14 attn_norm weight (4096 x bf16).
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xbf16>
%239 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.14 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.14.attn_q.q_input3Arscale = util.global.load @"__auto.blk.14.attn_q.q_input:rscale" : tensor<f32>
%240 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_q.weight3Aqs = util.global.load @"__auto.blk.14.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%241 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.14.attn_k.q_input3Arscale = util.global.load @"__auto.blk.14.attn_k.q_input:rscale" : tensor<f32>
%242 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_k.weight3Aqs = util.global.load @"__auto.blk.14.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%243 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.14.attn_v.q_input3Arscale = util.global.load @"__auto.blk.14.attn_v.q_input:rscale" : tensor<f32>
%244 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_v.weight3Aqs = util.global.load @"__auto.blk.14.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%245 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.14 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.14.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.14.kv_cache.quantizer:rscale" : tensor<f32>
%246 = torch_c.from_builtin_tensor %__auto.blk.14.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.14 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.14.attn_output.q_input3Arscale = util.global.load @"__auto.blk.14.attn_output.q_input:rscale" : tensor<f32>
%247 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.attn_output.weight3Aqs = util.global.load @"__auto.blk.14.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%248 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.14 ffn_norm weight (4096 x bf16).
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xbf16>
%249 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.14 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.14.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_gate.q_input:rscale" : tensor<f32>
%250 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.14.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%251 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_up.q_input:rscale" : tensor<f32>
%252 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_up.weight3Aqs = util.global.load @"__auto.blk.14.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%253 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.14.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_down.q_input:rscale" : tensor<f32>
%254 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.14.ffn_down.weight3Aqs = util.global.load @"__auto.blk.14.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%255 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.15 parameters (results %256 .. %272).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.15 attn_norm weight (4096 x bf16).
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xbf16>
%256 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.15 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.15.attn_q.q_input3Arscale = util.global.load @"__auto.blk.15.attn_q.q_input:rscale" : tensor<f32>
%257 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_q.weight3Aqs = util.global.load @"__auto.blk.15.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%258 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.15.attn_k.q_input3Arscale = util.global.load @"__auto.blk.15.attn_k.q_input:rscale" : tensor<f32>
%259 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_k.weight3Aqs = util.global.load @"__auto.blk.15.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%260 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.15.attn_v.q_input3Arscale = util.global.load @"__auto.blk.15.attn_v.q_input:rscale" : tensor<f32>
%261 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_v.weight3Aqs = util.global.load @"__auto.blk.15.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%262 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.15 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.15.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.15.kv_cache.quantizer:rscale" : tensor<f32>
%263 = torch_c.from_builtin_tensor %__auto.blk.15.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.15 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.15.attn_output.q_input3Arscale = util.global.load @"__auto.blk.15.attn_output.q_input:rscale" : tensor<f32>
%264 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.attn_output.weight3Aqs = util.global.load @"__auto.blk.15.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%265 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.15 ffn_norm weight (4096 x bf16).
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xbf16>
%266 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.15 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.15.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_gate.q_input:rscale" : tensor<f32>
%267 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.15.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%268 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_up.q_input:rscale" : tensor<f32>
%269 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_up.weight3Aqs = util.global.load @"__auto.blk.15.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%270 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.15.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_down.q_input:rscale" : tensor<f32>
%271 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.15.ffn_down.weight3Aqs = util.global.load @"__auto.blk.15.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%272 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.16 parameters (results %273 .. %289).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.16 attn_norm weight (4096 x bf16).
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xbf16>
%273 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.16 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.16.attn_q.q_input3Arscale = util.global.load @"__auto.blk.16.attn_q.q_input:rscale" : tensor<f32>
%274 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_q.weight3Aqs = util.global.load @"__auto.blk.16.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%275 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.16.attn_k.q_input3Arscale = util.global.load @"__auto.blk.16.attn_k.q_input:rscale" : tensor<f32>
%276 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_k.weight3Aqs = util.global.load @"__auto.blk.16.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%277 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.16.attn_v.q_input3Arscale = util.global.load @"__auto.blk.16.attn_v.q_input:rscale" : tensor<f32>
%278 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_v.weight3Aqs = util.global.load @"__auto.blk.16.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%279 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.16 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.16.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.16.kv_cache.quantizer:rscale" : tensor<f32>
%280 = torch_c.from_builtin_tensor %__auto.blk.16.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.16 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.16.attn_output.q_input3Arscale = util.global.load @"__auto.blk.16.attn_output.q_input:rscale" : tensor<f32>
%281 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.attn_output.weight3Aqs = util.global.load @"__auto.blk.16.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%282 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.16 ffn_norm weight (4096 x bf16).
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xbf16>
%283 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.16 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.16.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_gate.q_input:rscale" : tensor<f32>
%284 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.16.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%285 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_up.q_input:rscale" : tensor<f32>
%286 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_up.weight3Aqs = util.global.load @"__auto.blk.16.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%287 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.16.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_down.q_input:rscale" : tensor<f32>
%288 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.16.ffn_down.weight3Aqs = util.global.load @"__auto.blk.16.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%289 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.17 parameters (results %290 .. %306).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.17 attn_norm weight (4096 x bf16).
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xbf16>
%290 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.17 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.17.attn_q.q_input3Arscale = util.global.load @"__auto.blk.17.attn_q.q_input:rscale" : tensor<f32>
%291 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_q.weight3Aqs = util.global.load @"__auto.blk.17.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%292 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.17.attn_k.q_input3Arscale = util.global.load @"__auto.blk.17.attn_k.q_input:rscale" : tensor<f32>
%293 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_k.weight3Aqs = util.global.load @"__auto.blk.17.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%294 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.17.attn_v.q_input3Arscale = util.global.load @"__auto.blk.17.attn_v.q_input:rscale" : tensor<f32>
%295 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_v.weight3Aqs = util.global.load @"__auto.blk.17.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%296 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.17 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.17.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.17.kv_cache.quantizer:rscale" : tensor<f32>
%297 = torch_c.from_builtin_tensor %__auto.blk.17.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.17 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.17.attn_output.q_input3Arscale = util.global.load @"__auto.blk.17.attn_output.q_input:rscale" : tensor<f32>
%298 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.attn_output.weight3Aqs = util.global.load @"__auto.blk.17.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%299 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.17 ffn_norm weight (4096 x bf16).
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xbf16>
%300 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.17 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.17.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_gate.q_input:rscale" : tensor<f32>
%301 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.17.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%302 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_up.q_input:rscale" : tensor<f32>
%303 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_up.weight3Aqs = util.global.load @"__auto.blk.17.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%304 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.17.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_down.q_input:rscale" : tensor<f32>
%305 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.17.ffn_down.weight3Aqs = util.global.load @"__auto.blk.17.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%306 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.18 parameters (results %307 .. %323).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.18 attn_norm weight (4096 x bf16).
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xbf16>
%307 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.18 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.18.attn_q.q_input3Arscale = util.global.load @"__auto.blk.18.attn_q.q_input:rscale" : tensor<f32>
%308 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_q.weight3Aqs = util.global.load @"__auto.blk.18.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%309 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.18.attn_k.q_input3Arscale = util.global.load @"__auto.blk.18.attn_k.q_input:rscale" : tensor<f32>
%310 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_k.weight3Aqs = util.global.load @"__auto.blk.18.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%311 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.18.attn_v.q_input3Arscale = util.global.load @"__auto.blk.18.attn_v.q_input:rscale" : tensor<f32>
%312 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_v.weight3Aqs = util.global.load @"__auto.blk.18.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%313 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.18 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.18.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.18.kv_cache.quantizer:rscale" : tensor<f32>
%314 = torch_c.from_builtin_tensor %__auto.blk.18.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.18 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.18.attn_output.q_input3Arscale = util.global.load @"__auto.blk.18.attn_output.q_input:rscale" : tensor<f32>
%315 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.attn_output.weight3Aqs = util.global.load @"__auto.blk.18.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%316 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.18 ffn_norm weight (4096 x bf16).
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xbf16>
%317 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.18 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.18.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_gate.q_input:rscale" : tensor<f32>
%318 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.18.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%319 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_up.q_input:rscale" : tensor<f32>
%320 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_up.weight3Aqs = util.global.load @"__auto.blk.18.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%321 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.18.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_down.q_input:rscale" : tensor<f32>
%322 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.18.ffn_down.weight3Aqs = util.global.load @"__auto.blk.18.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%323 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// blk.19 parameters (results %324 .. %340).
// Every pair below is a util.global.load of a named global followed by a
// torch_c.from_builtin_tensor that wraps it as a !torch.vtensor with the same
// shape and element type.
// blk.19 attn_norm weight (4096 x bf16).
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xbf16>
%324 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.19 attn_q / attn_k / attn_v: a rank-0 f32 "q_input:rscale" and an
// f8E4M3FNUZ "weight:qs" tensor for each (4096x4096 for q, 1024x4096 for k and v).
%__auto.blk.19.attn_q.q_input3Arscale = util.global.load @"__auto.blk.19.attn_q.q_input:rscale" : tensor<f32>
%325 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_q.weight3Aqs = util.global.load @"__auto.blk.19.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%326 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.19.attn_k.q_input3Arscale = util.global.load @"__auto.blk.19.attn_k.q_input:rscale" : tensor<f32>
%327 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_k.weight3Aqs = util.global.load @"__auto.blk.19.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%328 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.19.attn_v.q_input3Arscale = util.global.load @"__auto.blk.19.attn_v.q_input:rscale" : tensor<f32>
%329 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_v.weight3Aqs = util.global.load @"__auto.blk.19.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%330 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// blk.19 kv_cache "quantizer:rscale" (rank-0 f32; no weight tensor is loaded for it here).
%__auto.blk.19.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.19.kv_cache.quantizer:rscale" : tensor<f32>
%331 = torch_c.from_builtin_tensor %__auto.blk.19.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// blk.19 attn_output rscale and 4096x4096 f8E4M3FNUZ weight.
%__auto.blk.19.attn_output.q_input3Arscale = util.global.load @"__auto.blk.19.attn_output.q_input:rscale" : tensor<f32>
%332 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.attn_output.weight3Aqs = util.global.load @"__auto.blk.19.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%333 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// blk.19 ffn_norm weight (4096 x bf16).
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xbf16>
%334 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// blk.19 ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rscale plus
// f8E4M3FNUZ weight for each.
%__auto.blk.19.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_gate.q_input:rscale" : tensor<f32>
%335 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.19.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%336 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_up.q_input:rscale" : tensor<f32>
%337 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_up.weight3Aqs = util.global.load @"__auto.blk.19.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%338 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.19.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_down.q_input:rscale" : tensor<f32>
%339 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.19.ffn_down.weight3Aqs = util.global.load @"__auto.blk.19.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%340 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.20.*" (results %341..%357). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xbf16>
%341 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.20.attn_q.q_input3Arscale = util.global.load @"__auto.blk.20.attn_q.q_input:rscale" : tensor<f32>
%342 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_q.weight3Aqs = util.global.load @"__auto.blk.20.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%343 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.20.attn_k.q_input3Arscale = util.global.load @"__auto.blk.20.attn_k.q_input:rscale" : tensor<f32>
%344 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_k.weight3Aqs = util.global.load @"__auto.blk.20.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%345 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.20.attn_v.q_input3Arscale = util.global.load @"__auto.blk.20.attn_v.q_input:rscale" : tensor<f32>
%346 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_v.weight3Aqs = util.global.load @"__auto.blk.20.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%347 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.20.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.20.kv_cache.quantizer:rscale" : tensor<f32>
%348 = torch_c.from_builtin_tensor %__auto.blk.20.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.20.attn_output.q_input3Arscale = util.global.load @"__auto.blk.20.attn_output.q_input:rscale" : tensor<f32>
%349 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.attn_output.weight3Aqs = util.global.load @"__auto.blk.20.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%350 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xbf16>
%351 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.20.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_gate.q_input:rscale" : tensor<f32>
%352 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.20.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%353 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_up.q_input:rscale" : tensor<f32>
%354 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_up.weight3Aqs = util.global.load @"__auto.blk.20.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%355 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.20.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_down.q_input:rscale" : tensor<f32>
%356 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.20.ffn_down.weight3Aqs = util.global.load @"__auto.blk.20.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%357 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.21.*" (results %358..%374). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xbf16>
%358 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.21.attn_q.q_input3Arscale = util.global.load @"__auto.blk.21.attn_q.q_input:rscale" : tensor<f32>
%359 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_q.weight3Aqs = util.global.load @"__auto.blk.21.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%360 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.21.attn_k.q_input3Arscale = util.global.load @"__auto.blk.21.attn_k.q_input:rscale" : tensor<f32>
%361 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_k.weight3Aqs = util.global.load @"__auto.blk.21.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%362 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.21.attn_v.q_input3Arscale = util.global.load @"__auto.blk.21.attn_v.q_input:rscale" : tensor<f32>
%363 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_v.weight3Aqs = util.global.load @"__auto.blk.21.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%364 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.21.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.21.kv_cache.quantizer:rscale" : tensor<f32>
%365 = torch_c.from_builtin_tensor %__auto.blk.21.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.21.attn_output.q_input3Arscale = util.global.load @"__auto.blk.21.attn_output.q_input:rscale" : tensor<f32>
%366 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.attn_output.weight3Aqs = util.global.load @"__auto.blk.21.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%367 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xbf16>
%368 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.21.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_gate.q_input:rscale" : tensor<f32>
%369 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.21.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%370 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_up.q_input:rscale" : tensor<f32>
%371 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_up.weight3Aqs = util.global.load @"__auto.blk.21.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%372 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.21.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_down.q_input:rscale" : tensor<f32>
%373 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.21.ffn_down.weight3Aqs = util.global.load @"__auto.blk.21.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%374 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.22.*" (results %375..%391). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xbf16>
%375 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.22.attn_q.q_input3Arscale = util.global.load @"__auto.blk.22.attn_q.q_input:rscale" : tensor<f32>
%376 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_q.weight3Aqs = util.global.load @"__auto.blk.22.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%377 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.22.attn_k.q_input3Arscale = util.global.load @"__auto.blk.22.attn_k.q_input:rscale" : tensor<f32>
%378 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_k.weight3Aqs = util.global.load @"__auto.blk.22.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%379 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.22.attn_v.q_input3Arscale = util.global.load @"__auto.blk.22.attn_v.q_input:rscale" : tensor<f32>
%380 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_v.weight3Aqs = util.global.load @"__auto.blk.22.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%381 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.22.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.22.kv_cache.quantizer:rscale" : tensor<f32>
%382 = torch_c.from_builtin_tensor %__auto.blk.22.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.22.attn_output.q_input3Arscale = util.global.load @"__auto.blk.22.attn_output.q_input:rscale" : tensor<f32>
%383 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.attn_output.weight3Aqs = util.global.load @"__auto.blk.22.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%384 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xbf16>
%385 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.22.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_gate.q_input:rscale" : tensor<f32>
%386 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.22.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%387 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_up.q_input:rscale" : tensor<f32>
%388 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_up.weight3Aqs = util.global.load @"__auto.blk.22.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%389 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.22.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_down.q_input:rscale" : tensor<f32>
%390 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.22.ffn_down.weight3Aqs = util.global.load @"__auto.blk.22.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%391 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.23.*" (results %392..%408). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xbf16>
%392 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.23.attn_q.q_input3Arscale = util.global.load @"__auto.blk.23.attn_q.q_input:rscale" : tensor<f32>
%393 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_q.weight3Aqs = util.global.load @"__auto.blk.23.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%394 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.23.attn_k.q_input3Arscale = util.global.load @"__auto.blk.23.attn_k.q_input:rscale" : tensor<f32>
%395 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_k.weight3Aqs = util.global.load @"__auto.blk.23.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%396 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.23.attn_v.q_input3Arscale = util.global.load @"__auto.blk.23.attn_v.q_input:rscale" : tensor<f32>
%397 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_v.weight3Aqs = util.global.load @"__auto.blk.23.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%398 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.23.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.23.kv_cache.quantizer:rscale" : tensor<f32>
%399 = torch_c.from_builtin_tensor %__auto.blk.23.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.23.attn_output.q_input3Arscale = util.global.load @"__auto.blk.23.attn_output.q_input:rscale" : tensor<f32>
%400 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.attn_output.weight3Aqs = util.global.load @"__auto.blk.23.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%401 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xbf16>
%402 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.23.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_gate.q_input:rscale" : tensor<f32>
%403 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.23.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%404 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_up.q_input:rscale" : tensor<f32>
%405 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_up.weight3Aqs = util.global.load @"__auto.blk.23.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%406 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.23.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_down.q_input:rscale" : tensor<f32>
%407 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.23.ffn_down.weight3Aqs = util.global.load @"__auto.blk.23.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%408 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.24.*" (results %409..%425). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xbf16>
%409 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.24.attn_q.q_input3Arscale = util.global.load @"__auto.blk.24.attn_q.q_input:rscale" : tensor<f32>
%410 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_q.weight3Aqs = util.global.load @"__auto.blk.24.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%411 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.24.attn_k.q_input3Arscale = util.global.load @"__auto.blk.24.attn_k.q_input:rscale" : tensor<f32>
%412 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_k.weight3Aqs = util.global.load @"__auto.blk.24.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%413 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.24.attn_v.q_input3Arscale = util.global.load @"__auto.blk.24.attn_v.q_input:rscale" : tensor<f32>
%414 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_v.weight3Aqs = util.global.load @"__auto.blk.24.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%415 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.24.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.24.kv_cache.quantizer:rscale" : tensor<f32>
%416 = torch_c.from_builtin_tensor %__auto.blk.24.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.24.attn_output.q_input3Arscale = util.global.load @"__auto.blk.24.attn_output.q_input:rscale" : tensor<f32>
%417 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.attn_output.weight3Aqs = util.global.load @"__auto.blk.24.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%418 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xbf16>
%419 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.24.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_gate.q_input:rscale" : tensor<f32>
%420 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.24.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%421 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_up.q_input:rscale" : tensor<f32>
%422 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_up.weight3Aqs = util.global.load @"__auto.blk.24.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%423 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.24.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_down.q_input:rscale" : tensor<f32>
%424 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.24.ffn_down.weight3Aqs = util.global.load @"__auto.blk.24.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%425 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.25.*" (results %426..%442). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xbf16>
%426 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.25.attn_q.q_input3Arscale = util.global.load @"__auto.blk.25.attn_q.q_input:rscale" : tensor<f32>
%427 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_q.weight3Aqs = util.global.load @"__auto.blk.25.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%428 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.25.attn_k.q_input3Arscale = util.global.load @"__auto.blk.25.attn_k.q_input:rscale" : tensor<f32>
%429 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_k.weight3Aqs = util.global.load @"__auto.blk.25.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%430 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.25.attn_v.q_input3Arscale = util.global.load @"__auto.blk.25.attn_v.q_input:rscale" : tensor<f32>
%431 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_v.weight3Aqs = util.global.load @"__auto.blk.25.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%432 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.25.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.25.kv_cache.quantizer:rscale" : tensor<f32>
%433 = torch_c.from_builtin_tensor %__auto.blk.25.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.25.attn_output.q_input3Arscale = util.global.load @"__auto.blk.25.attn_output.q_input:rscale" : tensor<f32>
%434 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.attn_output.weight3Aqs = util.global.load @"__auto.blk.25.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%435 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xbf16>
%436 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.25.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_gate.q_input:rscale" : tensor<f32>
%437 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.25.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%438 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_up.q_input:rscale" : tensor<f32>
%439 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_up.weight3Aqs = util.global.load @"__auto.blk.25.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%440 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.25.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_down.q_input:rscale" : tensor<f32>
%441 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.25.ffn_down.weight3Aqs = util.global.load @"__auto.blk.25.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%442 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.26.*" (results %443..%459). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xbf16>
%443 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.26.attn_q.q_input3Arscale = util.global.load @"__auto.blk.26.attn_q.q_input:rscale" : tensor<f32>
%444 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_q.weight3Aqs = util.global.load @"__auto.blk.26.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%445 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.26.attn_k.q_input3Arscale = util.global.load @"__auto.blk.26.attn_k.q_input:rscale" : tensor<f32>
%446 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_k.weight3Aqs = util.global.load @"__auto.blk.26.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%447 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.26.attn_v.q_input3Arscale = util.global.load @"__auto.blk.26.attn_v.q_input:rscale" : tensor<f32>
%448 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_v.weight3Aqs = util.global.load @"__auto.blk.26.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%449 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.26.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.26.kv_cache.quantizer:rscale" : tensor<f32>
%450 = torch_c.from_builtin_tensor %__auto.blk.26.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.26.attn_output.q_input3Arscale = util.global.load @"__auto.blk.26.attn_output.q_input:rscale" : tensor<f32>
%451 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.attn_output.weight3Aqs = util.global.load @"__auto.blk.26.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%452 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xbf16>
%453 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.26.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_gate.q_input:rscale" : tensor<f32>
%454 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.26.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%455 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_up.q_input:rscale" : tensor<f32>
%456 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_up.weight3Aqs = util.global.load @"__auto.blk.26.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%457 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.26.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_down.q_input:rscale" : tensor<f32>
%458 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.26.ffn_down.weight3Aqs = util.global.load @"__auto.blk.26.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%459 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
// Parameters named "blk.27.*" (results %460..%476). Each util.global.load reads a
// module global; the torch_c.from_builtin_tensor that follows re-types that value as a
// !torch.vtensor with the same shape and element type. In SSA names, "3A" replaces
// the ':' of the global symbol.
// NOTE(review): "rscale"/"qs" presumably denote quantization scale / quantized data — confirm with the exporter.
// attn_norm.weight (4096 x bf16).
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xbf16>
%460 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// attn_q (4096x4096), attn_k and attn_v (1024x4096): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.27.attn_q.q_input3Arscale = util.global.load @"__auto.blk.27.attn_q.q_input:rscale" : tensor<f32>
%461 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_q.weight3Aqs = util.global.load @"__auto.blk.27.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%462 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.27.attn_k.q_input3Arscale = util.global.load @"__auto.blk.27.attn_k.q_input:rscale" : tensor<f32>
%463 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_k.weight3Aqs = util.global.load @"__auto.blk.27.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%464 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.27.attn_v.q_input3Arscale = util.global.load @"__auto.blk.27.attn_v.q_input:rscale" : tensor<f32>
%465 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_v.weight3Aqs = util.global.load @"__auto.blk.27.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%466 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
// kv_cache.quantizer:rscale (rank-0 f32).
%__auto.blk.27.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.27.kv_cache.quantizer:rscale" : tensor<f32>
%467 = torch_c.from_builtin_tensor %__auto.blk.27.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
// attn_output: rank-0 f32 "q_input:rscale" and 4096x4096 f8E4M3FNUZ "weight:qs".
%__auto.blk.27.attn_output.q_input3Arscale = util.global.load @"__auto.blk.27.attn_output.q_input:rscale" : tensor<f32>
%468 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.attn_output.weight3Aqs = util.global.load @"__auto.blk.27.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%469 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
// ffn_norm.weight (4096 x bf16).
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xbf16>
%470 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
// ffn_gate / ffn_up (14336x4096) and ffn_down (4096x14336): rank-0 f32 "q_input:rscale" + f8E4M3FNUZ "weight:qs".
%__auto.blk.27.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_gate.q_input:rscale" : tensor<f32>
%471 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.27.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%472 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_up.q_input:rscale" : tensor<f32>
%473 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_up.weight3Aqs = util.global.load @"__auto.blk.27.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%474 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.27.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_down.q_input:rscale" : tensor<f32>
%475 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.27.ffn_down.weight3Aqs = util.global.load @"__auto.blk.27.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%476 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xbf16>
%477 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.28.attn_q.q_input3Arscale = util.global.load @"__auto.blk.28.attn_q.q_input:rscale" : tensor<f32>
%478 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_q.weight3Aqs = util.global.load @"__auto.blk.28.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%479 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.28.attn_k.q_input3Arscale = util.global.load @"__auto.blk.28.attn_k.q_input:rscale" : tensor<f32>
%480 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_k.weight3Aqs = util.global.load @"__auto.blk.28.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%481 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.28.attn_v.q_input3Arscale = util.global.load @"__auto.blk.28.attn_v.q_input:rscale" : tensor<f32>
%482 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_v.weight3Aqs = util.global.load @"__auto.blk.28.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%483 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.28.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.28.kv_cache.quantizer:rscale" : tensor<f32>
%484 = torch_c.from_builtin_tensor %__auto.blk.28.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_output.q_input3Arscale = util.global.load @"__auto.blk.28.attn_output.q_input:rscale" : tensor<f32>
%485 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.attn_output.weight3Aqs = util.global.load @"__auto.blk.28.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%486 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xbf16>
%487 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.28.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_gate.q_input:rscale" : tensor<f32>
%488 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.28.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%489 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_up.q_input:rscale" : tensor<f32>
%490 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_up.weight3Aqs = util.global.load @"__auto.blk.28.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%491 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.28.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_down.q_input:rscale" : tensor<f32>
%492 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.28.ffn_down.weight3Aqs = util.global.load @"__auto.blk.28.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%493 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xbf16>
%494 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.29.attn_q.q_input3Arscale = util.global.load @"__auto.blk.29.attn_q.q_input:rscale" : tensor<f32>
%495 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_q.weight3Aqs = util.global.load @"__auto.blk.29.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%496 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.29.attn_k.q_input3Arscale = util.global.load @"__auto.blk.29.attn_k.q_input:rscale" : tensor<f32>
%497 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_k.weight3Aqs = util.global.load @"__auto.blk.29.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%498 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.29.attn_v.q_input3Arscale = util.global.load @"__auto.blk.29.attn_v.q_input:rscale" : tensor<f32>
%499 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_v.weight3Aqs = util.global.load @"__auto.blk.29.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%500 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.29.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.29.kv_cache.quantizer:rscale" : tensor<f32>
%501 = torch_c.from_builtin_tensor %__auto.blk.29.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_output.q_input3Arscale = util.global.load @"__auto.blk.29.attn_output.q_input:rscale" : tensor<f32>
%502 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.attn_output.weight3Aqs = util.global.load @"__auto.blk.29.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%503 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xbf16>
%504 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.29.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_gate.q_input:rscale" : tensor<f32>
%505 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.29.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%506 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_up.q_input:rscale" : tensor<f32>
%507 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_up.weight3Aqs = util.global.load @"__auto.blk.29.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%508 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.29.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_down.q_input:rscale" : tensor<f32>
%509 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.29.ffn_down.weight3Aqs = util.global.load @"__auto.blk.29.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%510 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xbf16>
%511 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.30.attn_q.q_input3Arscale = util.global.load @"__auto.blk.30.attn_q.q_input:rscale" : tensor<f32>
%512 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_q.weight3Aqs = util.global.load @"__auto.blk.30.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%513 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.30.attn_k.q_input3Arscale = util.global.load @"__auto.blk.30.attn_k.q_input:rscale" : tensor<f32>
%514 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_k.weight3Aqs = util.global.load @"__auto.blk.30.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%515 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.30.attn_v.q_input3Arscale = util.global.load @"__auto.blk.30.attn_v.q_input:rscale" : tensor<f32>
%516 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_v.weight3Aqs = util.global.load @"__auto.blk.30.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%517 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.30.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.30.kv_cache.quantizer:rscale" : tensor<f32>
%518 = torch_c.from_builtin_tensor %__auto.blk.30.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_output.q_input3Arscale = util.global.load @"__auto.blk.30.attn_output.q_input:rscale" : tensor<f32>
%519 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.attn_output.weight3Aqs = util.global.load @"__auto.blk.30.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%520 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xbf16>
%521 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.30.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_gate.q_input:rscale" : tensor<f32>
%522 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.30.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%523 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_up.q_input:rscale" : tensor<f32>
%524 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_up.weight3Aqs = util.global.load @"__auto.blk.30.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%525 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.30.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_down.q_input:rscale" : tensor<f32>
%526 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.30.ffn_down.weight3Aqs = util.global.load @"__auto.blk.30.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%527 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xbf16>
%528 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.31.attn_q.q_input3Arscale = util.global.load @"__auto.blk.31.attn_q.q_input:rscale" : tensor<f32>
%529 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_q.weight3Aqs = util.global.load @"__auto.blk.31.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%530 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.31.attn_k.q_input3Arscale = util.global.load @"__auto.blk.31.attn_k.q_input:rscale" : tensor<f32>
%531 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_k.weight3Aqs = util.global.load @"__auto.blk.31.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%532 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.31.attn_v.q_input3Arscale = util.global.load @"__auto.blk.31.attn_v.q_input:rscale" : tensor<f32>
%533 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_v.weight3Aqs = util.global.load @"__auto.blk.31.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ>
%534 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ>
%__auto.blk.31.kv_cache.quantizer3Arscale = util.global.load @"__auto.blk.31.kv_cache.quantizer:rscale" : tensor<f32>
%535 = torch_c.from_builtin_tensor %__auto.blk.31.kv_cache.quantizer3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_output.q_input3Arscale = util.global.load @"__auto.blk.31.attn_output.q_input:rscale" : tensor<f32>
%536 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.attn_output.weight3Aqs = util.global.load @"__auto.blk.31.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ>
%537 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xbf16>
%538 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.blk.31.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_gate.q_input:rscale" : tensor<f32>
%539 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.31.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%540 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_up.q_input:rscale" : tensor<f32>
%541 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_up.weight3Aqs = util.global.load @"__auto.blk.31.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ>
%542 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ>
%__auto.blk.31.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_down.q_input:rscale" : tensor<f32>
%543 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32>
%__auto.blk.31.ffn_down.weight3Aqs = util.global.load @"__auto.blk.31.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ>
%544 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ>
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xbf16>
%545 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16>
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xbf16>
%546 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16>
%547 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>
%548 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int
%549 = torch.symbolic_int "s2" {min_val = 2, max_val = 9223372036854775806} : !torch.int
torch.bind_symbolic_shape %arg0, [%548], affine_map<()[s0] -> (1, s0 * 32)> : !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %arg2, [%548], affine_map<()[s0] -> (1, s0)> : !torch.vtensor<[1,?],si64>
torch.bind_symbolic_shape %547, [%549], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ>
%int-1 = torch.constant.int -1
%false = torch.constant.bool false
%false_0 = torch.constant.bool false
%550 = torch.aten.embedding %0, %arg0, %int-1, %false, %false_0 : !torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[1,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %550, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%int6 = torch.constant.int 6
%551 = torch.prims.convert_element_type %550, %int6 : !torch.vtensor<[1,?,4096],bf16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %551, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int2 = torch.constant.int 2
%552 = torch.aten.pow.Tensor_Scalar %551, %int2 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %552, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int-1_1 = torch.constant.int -1
%553 = torch.prim.ListConstruct %int-1_1 : (!torch.int) -> !torch.list<int>
%true = torch.constant.bool true
%none = torch.constant.none
%554 = torch.aten.mean.dim %552, %553, %true, %none : !torch.vtensor<[1,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %554, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%float1.000000e-05 = torch.constant.float 1.000000e-05
%int1 = torch.constant.int 1
%555 = torch.aten.add.Scalar %554, %float1.000000e-05, %int1 : !torch.vtensor<[1,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %555, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%556 = torch.aten.rsqrt %555 : !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,1],f32>
torch.bind_symbolic_shape %556, [%548], affine_map<()[s0] -> (1, s0 * 32, 1)> : !torch.vtensor<[1,?,1],f32>
%557 = torch.aten.mul.Tensor %551, %556 : !torch.vtensor<[1,?,4096],f32>, !torch.vtensor<[1,?,1],f32> -> !torch.vtensor<[1,?,4096],f32>
torch.bind_symbolic_shape %557, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],f32>
%int15 = torch.constant.int 15
%558 = torch.prims.convert_element_type %557, %int15 : !torch.vtensor<[1,?,4096],f32>, !torch.int -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %558, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%559 = torch.aten.mul.Tensor %1, %558 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[1,?,4096],bf16> -> !torch.vtensor<[1,?,4096],bf16>
torch.bind_symbolic_shape %559, [%548], affine_map<()[s0] -> (1, s0 * 32, 4096)> : !torch.vtensor<[1,?,4096],bf16>
%560 = torch.aten.div.Tensor %559, %2 : !torch.vtensor<[1,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[1,?,4096],bf16>
return %0 : !torch.vtensor<[128256,4096],bf16>
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment