// Gist pashu123/45fe64caa21cfdfa9890698660184a44, created April 30, 2024.
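// Torch-MLIR/IREE input IR for an LLM prefill entry point. The two affine
// maps below are the usual broadcast/identity indexing pair for linalg-style
// elementwise ops later in the module: #map broadcasts a 2-D operand (d1, d2)
// across the leading dimension of a 3-D iteration space, and #map1 is the
// 3-D identity map.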
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module @module {
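// Each util.global below binds one named parameter from the "model" parameter
// archive (#stream.parameter.named), so weights are resolved at load time
// rather than embedded in the module. The shapes (128256x4096 token embedding,
// 32 blocks, 4096 model dim, 1024-wide K/V projections, 14336 FFN dim) are
// consistent with a Llama-3-8B-style architecture using grouped-query
// attention (1024 = 8 KV heads x 128 head dim); this reading is inferred from
// the shapes, not stated in the IR itself.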
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x4096xf16>
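// Inlined dense resource: plausibly a precomputed rotary-embedding table,
// 8192 positions x 64 complex values (64 = 128 head dim / 2).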
util.global private @__auto.constant_8192_64_torch.complex64 = dense_resource<__auto.constant_8192_64_torch.complex64> : tensor<8192x64xcomplex<f32>>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_up.weight = #stream.parameter.named<"model"::"blk.0.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_down.weight = #stream.parameter.named<"model"::"blk.0.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.1.attn_q.weight = #stream.parameter.named<"model"::"blk.1.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.1.attn_k.weight = #stream.parameter.named<"model"::"blk.1.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.1.attn_v.weight = #stream.parameter.named<"model"::"blk.1.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.1.attn_output.weight = #stream.parameter.named<"model"::"blk.1.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.1.ffn_gate.weight = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.1.ffn_up.weight = #stream.parameter.named<"model"::"blk.1.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.1.ffn_down.weight = #stream.parameter.named<"model"::"blk.1.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.2.attn_q.weight = #stream.parameter.named<"model"::"blk.2.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.2.attn_k.weight = #stream.parameter.named<"model"::"blk.2.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.2.attn_v.weight = #stream.parameter.named<"model"::"blk.2.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.2.attn_output.weight = #stream.parameter.named<"model"::"blk.2.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.2.ffn_gate.weight = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.2.ffn_up.weight = #stream.parameter.named<"model"::"blk.2.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.2.ffn_down.weight = #stream.parameter.named<"model"::"blk.2.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.3.attn_q.weight = #stream.parameter.named<"model"::"blk.3.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.3.attn_k.weight = #stream.parameter.named<"model"::"blk.3.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.3.attn_v.weight = #stream.parameter.named<"model"::"blk.3.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.3.attn_output.weight = #stream.parameter.named<"model"::"blk.3.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.3.ffn_gate.weight = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.3.ffn_up.weight = #stream.parameter.named<"model"::"blk.3.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.3.ffn_down.weight = #stream.parameter.named<"model"::"blk.3.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.4.attn_q.weight = #stream.parameter.named<"model"::"blk.4.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.4.attn_k.weight = #stream.parameter.named<"model"::"blk.4.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.4.attn_v.weight = #stream.parameter.named<"model"::"blk.4.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.4.attn_output.weight = #stream.parameter.named<"model"::"blk.4.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.4.ffn_gate.weight = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.4.ffn_up.weight = #stream.parameter.named<"model"::"blk.4.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.4.ffn_down.weight = #stream.parameter.named<"model"::"blk.4.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.5.attn_q.weight = #stream.parameter.named<"model"::"blk.5.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.5.attn_k.weight = #stream.parameter.named<"model"::"blk.5.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.5.attn_v.weight = #stream.parameter.named<"model"::"blk.5.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.5.attn_output.weight = #stream.parameter.named<"model"::"blk.5.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.5.ffn_gate.weight = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.5.ffn_up.weight = #stream.parameter.named<"model"::"blk.5.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.5.ffn_down.weight = #stream.parameter.named<"model"::"blk.5.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.6.attn_q.weight = #stream.parameter.named<"model"::"blk.6.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.6.attn_k.weight = #stream.parameter.named<"model"::"blk.6.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.6.attn_v.weight = #stream.parameter.named<"model"::"blk.6.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.6.attn_output.weight = #stream.parameter.named<"model"::"blk.6.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.6.ffn_gate.weight = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.6.ffn_up.weight = #stream.parameter.named<"model"::"blk.6.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.6.ffn_down.weight = #stream.parameter.named<"model"::"blk.6.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.7.attn_q.weight = #stream.parameter.named<"model"::"blk.7.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.7.attn_k.weight = #stream.parameter.named<"model"::"blk.7.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.7.attn_v.weight = #stream.parameter.named<"model"::"blk.7.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.7.attn_output.weight = #stream.parameter.named<"model"::"blk.7.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.7.ffn_gate.weight = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.7.ffn_up.weight = #stream.parameter.named<"model"::"blk.7.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.7.ffn_down.weight = #stream.parameter.named<"model"::"blk.7.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.8.attn_q.weight = #stream.parameter.named<"model"::"blk.8.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.8.attn_k.weight = #stream.parameter.named<"model"::"blk.8.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.8.attn_v.weight = #stream.parameter.named<"model"::"blk.8.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.8.attn_output.weight = #stream.parameter.named<"model"::"blk.8.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.8.ffn_gate.weight = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.8.ffn_up.weight = #stream.parameter.named<"model"::"blk.8.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.8.ffn_down.weight = #stream.parameter.named<"model"::"blk.8.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.9.attn_q.weight = #stream.parameter.named<"model"::"blk.9.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.9.attn_k.weight = #stream.parameter.named<"model"::"blk.9.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.9.attn_v.weight = #stream.parameter.named<"model"::"blk.9.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.9.attn_output.weight = #stream.parameter.named<"model"::"blk.9.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.9.ffn_gate.weight = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.9.ffn_up.weight = #stream.parameter.named<"model"::"blk.9.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.9.ffn_down.weight = #stream.parameter.named<"model"::"blk.9.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.10.attn_q.weight = #stream.parameter.named<"model"::"blk.10.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.10.attn_k.weight = #stream.parameter.named<"model"::"blk.10.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.10.attn_v.weight = #stream.parameter.named<"model"::"blk.10.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.10.attn_output.weight = #stream.parameter.named<"model"::"blk.10.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.10.ffn_gate.weight = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.10.ffn_up.weight = #stream.parameter.named<"model"::"blk.10.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.10.ffn_down.weight = #stream.parameter.named<"model"::"blk.10.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.11.attn_q.weight = #stream.parameter.named<"model"::"blk.11.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.11.attn_k.weight = #stream.parameter.named<"model"::"blk.11.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.11.attn_v.weight = #stream.parameter.named<"model"::"blk.11.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.11.attn_output.weight = #stream.parameter.named<"model"::"blk.11.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.11.ffn_gate.weight = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.11.ffn_up.weight = #stream.parameter.named<"model"::"blk.11.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.11.ffn_down.weight = #stream.parameter.named<"model"::"blk.11.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.12.attn_q.weight = #stream.parameter.named<"model"::"blk.12.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.12.attn_k.weight = #stream.parameter.named<"model"::"blk.12.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.12.attn_v.weight = #stream.parameter.named<"model"::"blk.12.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.12.attn_output.weight = #stream.parameter.named<"model"::"blk.12.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.12.ffn_gate.weight = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.12.ffn_up.weight = #stream.parameter.named<"model"::"blk.12.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.12.ffn_down.weight = #stream.parameter.named<"model"::"blk.12.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.13.attn_q.weight = #stream.parameter.named<"model"::"blk.13.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.13.attn_k.weight = #stream.parameter.named<"model"::"blk.13.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.13.attn_v.weight = #stream.parameter.named<"model"::"blk.13.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.13.attn_output.weight = #stream.parameter.named<"model"::"blk.13.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.13.ffn_gate.weight = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.13.ffn_up.weight = #stream.parameter.named<"model"::"blk.13.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.13.ffn_down.weight = #stream.parameter.named<"model"::"blk.13.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.14.attn_q.weight = #stream.parameter.named<"model"::"blk.14.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.14.attn_k.weight = #stream.parameter.named<"model"::"blk.14.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.14.attn_v.weight = #stream.parameter.named<"model"::"blk.14.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.14.attn_output.weight = #stream.parameter.named<"model"::"blk.14.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.14.ffn_gate.weight = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.14.ffn_up.weight = #stream.parameter.named<"model"::"blk.14.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.14.ffn_down.weight = #stream.parameter.named<"model"::"blk.14.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.15.attn_q.weight = #stream.parameter.named<"model"::"blk.15.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.15.attn_k.weight = #stream.parameter.named<"model"::"blk.15.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.15.attn_v.weight = #stream.parameter.named<"model"::"blk.15.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.15.attn_output.weight = #stream.parameter.named<"model"::"blk.15.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.15.ffn_gate.weight = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.15.ffn_up.weight = #stream.parameter.named<"model"::"blk.15.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.15.ffn_down.weight = #stream.parameter.named<"model"::"blk.15.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.16.attn_q.weight = #stream.parameter.named<"model"::"blk.16.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.16.attn_k.weight = #stream.parameter.named<"model"::"blk.16.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.16.attn_v.weight = #stream.parameter.named<"model"::"blk.16.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.16.attn_output.weight = #stream.parameter.named<"model"::"blk.16.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.16.ffn_gate.weight = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.16.ffn_up.weight = #stream.parameter.named<"model"::"blk.16.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.16.ffn_down.weight = #stream.parameter.named<"model"::"blk.16.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.17.attn_q.weight = #stream.parameter.named<"model"::"blk.17.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.17.attn_k.weight = #stream.parameter.named<"model"::"blk.17.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.17.attn_v.weight = #stream.parameter.named<"model"::"blk.17.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.17.attn_output.weight = #stream.parameter.named<"model"::"blk.17.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.17.ffn_gate.weight = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.17.ffn_up.weight = #stream.parameter.named<"model"::"blk.17.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.17.ffn_down.weight = #stream.parameter.named<"model"::"blk.17.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.18.attn_q.weight = #stream.parameter.named<"model"::"blk.18.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.18.attn_k.weight = #stream.parameter.named<"model"::"blk.18.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.18.attn_v.weight = #stream.parameter.named<"model"::"blk.18.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.18.attn_output.weight = #stream.parameter.named<"model"::"blk.18.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.18.ffn_gate.weight = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.18.ffn_up.weight = #stream.parameter.named<"model"::"blk.18.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.18.ffn_down.weight = #stream.parameter.named<"model"::"blk.18.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.19.attn_q.weight = #stream.parameter.named<"model"::"blk.19.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.19.attn_k.weight = #stream.parameter.named<"model"::"blk.19.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.19.attn_v.weight = #stream.parameter.named<"model"::"blk.19.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.19.attn_output.weight = #stream.parameter.named<"model"::"blk.19.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.19.ffn_gate.weight = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.19.ffn_up.weight = #stream.parameter.named<"model"::"blk.19.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.19.ffn_down.weight = #stream.parameter.named<"model"::"blk.19.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.20.attn_q.weight = #stream.parameter.named<"model"::"blk.20.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.20.attn_k.weight = #stream.parameter.named<"model"::"blk.20.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.20.attn_v.weight = #stream.parameter.named<"model"::"blk.20.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.20.attn_output.weight = #stream.parameter.named<"model"::"blk.20.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.20.ffn_gate.weight = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.20.ffn_up.weight = #stream.parameter.named<"model"::"blk.20.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.20.ffn_down.weight = #stream.parameter.named<"model"::"blk.20.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.21.attn_q.weight = #stream.parameter.named<"model"::"blk.21.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.21.attn_k.weight = #stream.parameter.named<"model"::"blk.21.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.21.attn_v.weight = #stream.parameter.named<"model"::"blk.21.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.21.attn_output.weight = #stream.parameter.named<"model"::"blk.21.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.21.ffn_gate.weight = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.21.ffn_up.weight = #stream.parameter.named<"model"::"blk.21.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.21.ffn_down.weight = #stream.parameter.named<"model"::"blk.21.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.22.attn_q.weight = #stream.parameter.named<"model"::"blk.22.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.22.attn_k.weight = #stream.parameter.named<"model"::"blk.22.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.22.attn_v.weight = #stream.parameter.named<"model"::"blk.22.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.22.attn_output.weight = #stream.parameter.named<"model"::"blk.22.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.22.ffn_gate.weight = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.22.ffn_up.weight = #stream.parameter.named<"model"::"blk.22.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.22.ffn_down.weight = #stream.parameter.named<"model"::"blk.22.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.23.attn_q.weight = #stream.parameter.named<"model"::"blk.23.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.23.attn_k.weight = #stream.parameter.named<"model"::"blk.23.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.23.attn_v.weight = #stream.parameter.named<"model"::"blk.23.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.23.attn_output.weight = #stream.parameter.named<"model"::"blk.23.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.23.ffn_gate.weight = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.23.ffn_up.weight = #stream.parameter.named<"model"::"blk.23.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.23.ffn_down.weight = #stream.parameter.named<"model"::"blk.23.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.24.attn_q.weight = #stream.parameter.named<"model"::"blk.24.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.24.attn_k.weight = #stream.parameter.named<"model"::"blk.24.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.24.attn_v.weight = #stream.parameter.named<"model"::"blk.24.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.24.attn_output.weight = #stream.parameter.named<"model"::"blk.24.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.24.ffn_gate.weight = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.24.ffn_up.weight = #stream.parameter.named<"model"::"blk.24.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.24.ffn_down.weight = #stream.parameter.named<"model"::"blk.24.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.25.attn_q.weight = #stream.parameter.named<"model"::"blk.25.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.25.attn_k.weight = #stream.parameter.named<"model"::"blk.25.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.25.attn_v.weight = #stream.parameter.named<"model"::"blk.25.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.25.attn_output.weight = #stream.parameter.named<"model"::"blk.25.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.25.ffn_gate.weight = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.25.ffn_up.weight = #stream.parameter.named<"model"::"blk.25.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.25.ffn_down.weight = #stream.parameter.named<"model"::"blk.25.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.26.attn_q.weight = #stream.parameter.named<"model"::"blk.26.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.26.attn_k.weight = #stream.parameter.named<"model"::"blk.26.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.26.attn_v.weight = #stream.parameter.named<"model"::"blk.26.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.26.attn_output.weight = #stream.parameter.named<"model"::"blk.26.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.26.ffn_gate.weight = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.26.ffn_up.weight = #stream.parameter.named<"model"::"blk.26.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.26.ffn_down.weight = #stream.parameter.named<"model"::"blk.26.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.27.attn_q.weight = #stream.parameter.named<"model"::"blk.27.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.27.attn_k.weight = #stream.parameter.named<"model"::"blk.27.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.27.attn_v.weight = #stream.parameter.named<"model"::"blk.27.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.27.attn_output.weight = #stream.parameter.named<"model"::"blk.27.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.27.ffn_gate.weight = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.27.ffn_up.weight = #stream.parameter.named<"model"::"blk.27.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.27.ffn_down.weight = #stream.parameter.named<"model"::"blk.27.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.28.attn_q.weight = #stream.parameter.named<"model"::"blk.28.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.28.attn_k.weight = #stream.parameter.named<"model"::"blk.28.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.28.attn_v.weight = #stream.parameter.named<"model"::"blk.28.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.28.attn_output.weight = #stream.parameter.named<"model"::"blk.28.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.28.ffn_gate.weight = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.28.ffn_up.weight = #stream.parameter.named<"model"::"blk.28.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.28.ffn_down.weight = #stream.parameter.named<"model"::"blk.28.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.29.attn_q.weight = #stream.parameter.named<"model"::"blk.29.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.29.attn_k.weight = #stream.parameter.named<"model"::"blk.29.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.29.attn_v.weight = #stream.parameter.named<"model"::"blk.29.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.29.attn_output.weight = #stream.parameter.named<"model"::"blk.29.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.29.ffn_gate.weight = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.29.ffn_up.weight = #stream.parameter.named<"model"::"blk.29.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.29.ffn_down.weight = #stream.parameter.named<"model"::"blk.29.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.30.attn_q.weight = #stream.parameter.named<"model"::"blk.30.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.30.attn_k.weight = #stream.parameter.named<"model"::"blk.30.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.30.attn_v.weight = #stream.parameter.named<"model"::"blk.30.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.30.attn_output.weight = #stream.parameter.named<"model"::"blk.30.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.30.ffn_gate.weight = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.30.ffn_up.weight = #stream.parameter.named<"model"::"blk.30.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.30.ffn_down.weight = #stream.parameter.named<"model"::"blk.30.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.31.attn_q.weight = #stream.parameter.named<"model"::"blk.31.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.31.attn_k.weight = #stream.parameter.named<"model"::"blk.31.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.31.attn_v.weight = #stream.parameter.named<"model"::"blk.31.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.31.attn_output.weight = #stream.parameter.named<"model"::"blk.31.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.31.ffn_gate.weight = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.31.ffn_up.weight = #stream.parameter.named<"model"::"blk.31.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.31.ffn_down.weight = #stream.parameter.named<"model"::"blk.31.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xf32>
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xf16>
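// Second inlined resource: a 1x1x8192x8192 boolean tensor, plausibly the
// causal attention mask for a maximum context length of 8192.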
util.global private @__auto.constant_1_1_8192_8192_torch.bool = dense_resource<__auto.constant_1_1_8192_8192_torch.bool> : tensor<1x1x8192x8192xi1>
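// Prefill entry point for batch size 4. A plausible reading of the signature:
// %arg0 holds token ids [4,?], %arg1 per-sequence lengths [4], %arg2 page ids
// into a paged KV cache [4,?], and %arg3 the flat cache slab, where each
// 1048576-element page would fit
// 32 layers x 2 (K,V) x 16 tokens x 8 KV heads x 128 head dim = 1048576
// f32 values. Only the shapes are certain; the argument roles are inferred.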
func.func @prefill_bs4(%arg0: !torch.vtensor<[4,?],si64>, %arg1: !torch.vtensor<[4],si64>, %arg2: !torch.vtensor<[4,?],si64>, %arg3: !torch.tensor<[?,1048576],f32>) -> !torch.vtensor<[4,?,4096],f32> {
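// The body first materializes every parameter: each util.global.load yields a
// builtin tensor, which torch_c.from_builtin_tensor bridges into the
// !torch.vtensor type system. This load/bridge pair repeats for all 32 blocks
// before any compute is emitted.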
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xf16>
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xf16> -> !torch.vtensor<[128256,4096],f16>
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xf32>
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.0.attn_q.weight = util.global.load @__auto.blk.0.attn_q.weight : tensor<4096x4096xf16>
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.0.attn_k.weight = util.global.load @__auto.blk.0.attn_k.weight : tensor<1024x4096xf16>
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.0.attn_v.weight = util.global.load @__auto.blk.0.attn_v.weight : tensor<1024x4096xf16>
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.constant_8192_64_torch.complex64 = util.global.load @__auto.constant_8192_64_torch.complex64 : tensor<8192x64xcomplex<f32>>
%5 = torch_c.from_builtin_tensor %__auto.constant_8192_64_torch.complex64 : tensor<8192x64xcomplex<f32>> -> !torch.vtensor<[8192,64],complex<f32>>
%__auto.blk.0.attn_output.weight = util.global.load @__auto.blk.0.attn_output.weight : tensor<4096x4096xf16>
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xf32>
%7 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.0.ffn_gate.weight = util.global.load @__auto.blk.0.ffn_gate.weight : tensor<14336x4096xf16>
%8 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.0.ffn_up.weight = util.global.load @__auto.blk.0.ffn_up.weight : tensor<14336x4096xf16>
%9 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.0.ffn_down.weight = util.global.load @__auto.blk.0.ffn_down.weight : tensor<4096x14336xf16>
%10 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xf32>
%11 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.1.attn_q.weight = util.global.load @__auto.blk.1.attn_q.weight : tensor<4096x4096xf16>
%12 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.1.attn_k.weight = util.global.load @__auto.blk.1.attn_k.weight : tensor<1024x4096xf16>
%13 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.1.attn_v.weight = util.global.load @__auto.blk.1.attn_v.weight : tensor<1024x4096xf16>
%14 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.1.attn_output.weight = util.global.load @__auto.blk.1.attn_output.weight : tensor<4096x4096xf16>
%15 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xf32>
%16 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.1.ffn_gate.weight = util.global.load @__auto.blk.1.ffn_gate.weight : tensor<14336x4096xf16>
%17 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.1.ffn_up.weight = util.global.load @__auto.blk.1.ffn_up.weight : tensor<14336x4096xf16>
%18 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.1.ffn_down.weight = util.global.load @__auto.blk.1.ffn_down.weight : tensor<4096x14336xf16>
%19 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xf32>
%20 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.2.attn_q.weight = util.global.load @__auto.blk.2.attn_q.weight : tensor<4096x4096xf16>
%21 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.2.attn_k.weight = util.global.load @__auto.blk.2.attn_k.weight : tensor<1024x4096xf16>
%22 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.2.attn_v.weight = util.global.load @__auto.blk.2.attn_v.weight : tensor<1024x4096xf16>
%23 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.2.attn_output.weight = util.global.load @__auto.blk.2.attn_output.weight : tensor<4096x4096xf16>
%24 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xf32>
%25 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.2.ffn_gate.weight = util.global.load @__auto.blk.2.ffn_gate.weight : tensor<14336x4096xf16>
%26 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.2.ffn_up.weight = util.global.load @__auto.blk.2.ffn_up.weight : tensor<14336x4096xf16>
%27 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.2.ffn_down.weight = util.global.load @__auto.blk.2.ffn_down.weight : tensor<4096x14336xf16>
%28 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xf32>
%29 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.3.attn_q.weight = util.global.load @__auto.blk.3.attn_q.weight : tensor<4096x4096xf16>
%30 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.3.attn_k.weight = util.global.load @__auto.blk.3.attn_k.weight : tensor<1024x4096xf16>
%31 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.3.attn_v.weight = util.global.load @__auto.blk.3.attn_v.weight : tensor<1024x4096xf16>
%32 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.3.attn_output.weight = util.global.load @__auto.blk.3.attn_output.weight : tensor<4096x4096xf16>
%33 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xf32>
%34 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.3.ffn_gate.weight = util.global.load @__auto.blk.3.ffn_gate.weight : tensor<14336x4096xf16>
%35 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.3.ffn_up.weight = util.global.load @__auto.blk.3.ffn_up.weight : tensor<14336x4096xf16>
%36 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.3.ffn_down.weight = util.global.load @__auto.blk.3.ffn_down.weight : tensor<4096x14336xf16>
%37 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xf32>
%38 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.4.attn_q.weight = util.global.load @__auto.blk.4.attn_q.weight : tensor<4096x4096xf16>
%39 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.4.attn_k.weight = util.global.load @__auto.blk.4.attn_k.weight : tensor<1024x4096xf16>
%40 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.4.attn_v.weight = util.global.load @__auto.blk.4.attn_v.weight : tensor<1024x4096xf16>
%41 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.4.attn_output.weight = util.global.load @__auto.blk.4.attn_output.weight : tensor<4096x4096xf16>
%42 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xf32>
%43 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.4.ffn_gate.weight = util.global.load @__auto.blk.4.ffn_gate.weight : tensor<14336x4096xf16>
%44 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.4.ffn_up.weight = util.global.load @__auto.blk.4.ffn_up.weight : tensor<14336x4096xf16>
%45 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.4.ffn_down.weight = util.global.load @__auto.blk.4.ffn_down.weight : tensor<4096x14336xf16>
%46 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xf32>
%47 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.5.attn_q.weight = util.global.load @__auto.blk.5.attn_q.weight : tensor<4096x4096xf16>
%48 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.5.attn_k.weight = util.global.load @__auto.blk.5.attn_k.weight : tensor<1024x4096xf16>
%49 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.5.attn_v.weight = util.global.load @__auto.blk.5.attn_v.weight : tensor<1024x4096xf16>
%50 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.5.attn_output.weight = util.global.load @__auto.blk.5.attn_output.weight : tensor<4096x4096xf16>
%51 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xf32>
%52 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.5.ffn_gate.weight = util.global.load @__auto.blk.5.ffn_gate.weight : tensor<14336x4096xf16>
%53 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.5.ffn_up.weight = util.global.load @__auto.blk.5.ffn_up.weight : tensor<14336x4096xf16>
%54 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.5.ffn_down.weight = util.global.load @__auto.blk.5.ffn_down.weight : tensor<4096x14336xf16>
%55 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xf32>
%56 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.6.attn_q.weight = util.global.load @__auto.blk.6.attn_q.weight : tensor<4096x4096xf16>
%57 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.6.attn_k.weight = util.global.load @__auto.blk.6.attn_k.weight : tensor<1024x4096xf16>
%58 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.6.attn_v.weight = util.global.load @__auto.blk.6.attn_v.weight : tensor<1024x4096xf16>
%59 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.6.attn_output.weight = util.global.load @__auto.blk.6.attn_output.weight : tensor<4096x4096xf16>
%60 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xf32>
%61 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.6.ffn_gate.weight = util.global.load @__auto.blk.6.ffn_gate.weight : tensor<14336x4096xf16>
%62 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.6.ffn_up.weight = util.global.load @__auto.blk.6.ffn_up.weight : tensor<14336x4096xf16>
%63 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.6.ffn_down.weight = util.global.load @__auto.blk.6.ffn_down.weight : tensor<4096x14336xf16>
%64 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xf32>
%65 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.7.attn_q.weight = util.global.load @__auto.blk.7.attn_q.weight : tensor<4096x4096xf16>
%66 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.7.attn_k.weight = util.global.load @__auto.blk.7.attn_k.weight : tensor<1024x4096xf16>
%67 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.7.attn_v.weight = util.global.load @__auto.blk.7.attn_v.weight : tensor<1024x4096xf16>
%68 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.7.attn_output.weight = util.global.load @__auto.blk.7.attn_output.weight : tensor<4096x4096xf16>
%69 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xf32>
%70 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.7.ffn_gate.weight = util.global.load @__auto.blk.7.ffn_gate.weight : tensor<14336x4096xf16>
%71 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.7.ffn_up.weight = util.global.load @__auto.blk.7.ffn_up.weight : tensor<14336x4096xf16>
%72 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.7.ffn_down.weight = util.global.load @__auto.blk.7.ffn_down.weight : tensor<4096x14336xf16>
%73 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xf32>
%74 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.8.attn_q.weight = util.global.load @__auto.blk.8.attn_q.weight : tensor<4096x4096xf16>
%75 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.8.attn_k.weight = util.global.load @__auto.blk.8.attn_k.weight : tensor<1024x4096xf16>
%76 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.8.attn_v.weight = util.global.load @__auto.blk.8.attn_v.weight : tensor<1024x4096xf16>
%77 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.8.attn_output.weight = util.global.load @__auto.blk.8.attn_output.weight : tensor<4096x4096xf16>
%78 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xf32>
%79 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.8.ffn_gate.weight = util.global.load @__auto.blk.8.ffn_gate.weight : tensor<14336x4096xf16>
%80 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.8.ffn_up.weight = util.global.load @__auto.blk.8.ffn_up.weight : tensor<14336x4096xf16>
%81 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.8.ffn_down.weight = util.global.load @__auto.blk.8.ffn_down.weight : tensor<4096x14336xf16>
%82 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xf32>
%83 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.9.attn_q.weight = util.global.load @__auto.blk.9.attn_q.weight : tensor<4096x4096xf16>
%84 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.9.attn_k.weight = util.global.load @__auto.blk.9.attn_k.weight : tensor<1024x4096xf16>
%85 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.9.attn_v.weight = util.global.load @__auto.blk.9.attn_v.weight : tensor<1024x4096xf16>
%86 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.9.attn_output.weight = util.global.load @__auto.blk.9.attn_output.weight : tensor<4096x4096xf16>
%87 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xf32>
%88 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.9.ffn_gate.weight = util.global.load @__auto.blk.9.ffn_gate.weight : tensor<14336x4096xf16>
%89 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.9.ffn_up.weight = util.global.load @__auto.blk.9.ffn_up.weight : tensor<14336x4096xf16>
%90 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.9.ffn_down.weight = util.global.load @__auto.blk.9.ffn_down.weight : tensor<4096x14336xf16>
%91 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xf32>
%92 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.10.attn_q.weight = util.global.load @__auto.blk.10.attn_q.weight : tensor<4096x4096xf16>
%93 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.10.attn_k.weight = util.global.load @__auto.blk.10.attn_k.weight : tensor<1024x4096xf16>
%94 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.10.attn_v.weight = util.global.load @__auto.blk.10.attn_v.weight : tensor<1024x4096xf16>
%95 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.10.attn_output.weight = util.global.load @__auto.blk.10.attn_output.weight : tensor<4096x4096xf16>
%96 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xf32>
%97 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.10.ffn_gate.weight = util.global.load @__auto.blk.10.ffn_gate.weight : tensor<14336x4096xf16>
%98 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.10.ffn_up.weight = util.global.load @__auto.blk.10.ffn_up.weight : tensor<14336x4096xf16>
%99 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.10.ffn_down.weight = util.global.load @__auto.blk.10.ffn_down.weight : tensor<4096x14336xf16>
%100 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xf32>
%101 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.11.attn_q.weight = util.global.load @__auto.blk.11.attn_q.weight : tensor<4096x4096xf16>
%102 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.11.attn_k.weight = util.global.load @__auto.blk.11.attn_k.weight : tensor<1024x4096xf16>
%103 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.11.attn_v.weight = util.global.load @__auto.blk.11.attn_v.weight : tensor<1024x4096xf16>
%104 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.11.attn_output.weight = util.global.load @__auto.blk.11.attn_output.weight : tensor<4096x4096xf16>
%105 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xf32>
%106 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.11.ffn_gate.weight = util.global.load @__auto.blk.11.ffn_gate.weight : tensor<14336x4096xf16>
%107 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.11.ffn_up.weight = util.global.load @__auto.blk.11.ffn_up.weight : tensor<14336x4096xf16>
%108 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.11.ffn_down.weight = util.global.load @__auto.blk.11.ffn_down.weight : tensor<4096x14336xf16>
%109 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xf32>
%110 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.12.attn_q.weight = util.global.load @__auto.blk.12.attn_q.weight : tensor<4096x4096xf16>
%111 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.12.attn_k.weight = util.global.load @__auto.blk.12.attn_k.weight : tensor<1024x4096xf16>
%112 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.12.attn_v.weight = util.global.load @__auto.blk.12.attn_v.weight : tensor<1024x4096xf16>
%113 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.12.attn_output.weight = util.global.load @__auto.blk.12.attn_output.weight : tensor<4096x4096xf16>
%114 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xf32>
%115 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.12.ffn_gate.weight = util.global.load @__auto.blk.12.ffn_gate.weight : tensor<14336x4096xf16>
%116 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.12.ffn_up.weight = util.global.load @__auto.blk.12.ffn_up.weight : tensor<14336x4096xf16>
%117 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.12.ffn_down.weight = util.global.load @__auto.blk.12.ffn_down.weight : tensor<4096x14336xf16>
%118 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xf32>
%119 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.13.attn_q.weight = util.global.load @__auto.blk.13.attn_q.weight : tensor<4096x4096xf16>
%120 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.13.attn_k.weight = util.global.load @__auto.blk.13.attn_k.weight : tensor<1024x4096xf16>
%121 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.13.attn_v.weight = util.global.load @__auto.blk.13.attn_v.weight : tensor<1024x4096xf16>
%122 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.13.attn_output.weight = util.global.load @__auto.blk.13.attn_output.weight : tensor<4096x4096xf16>
%123 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xf32>
%124 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.13.ffn_gate.weight = util.global.load @__auto.blk.13.ffn_gate.weight : tensor<14336x4096xf16>
%125 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.13.ffn_up.weight = util.global.load @__auto.blk.13.ffn_up.weight : tensor<14336x4096xf16>
%126 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.13.ffn_down.weight = util.global.load @__auto.blk.13.ffn_down.weight : tensor<4096x14336xf16>
%127 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xf32>
%128 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.14.attn_q.weight = util.global.load @__auto.blk.14.attn_q.weight : tensor<4096x4096xf16>
%129 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.14.attn_k.weight = util.global.load @__auto.blk.14.attn_k.weight : tensor<1024x4096xf16>
%130 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.14.attn_v.weight = util.global.load @__auto.blk.14.attn_v.weight : tensor<1024x4096xf16>
%131 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.14.attn_output.weight = util.global.load @__auto.blk.14.attn_output.weight : tensor<4096x4096xf16>
%132 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xf32>
%133 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.14.ffn_gate.weight = util.global.load @__auto.blk.14.ffn_gate.weight : tensor<14336x4096xf16>
%134 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.14.ffn_up.weight = util.global.load @__auto.blk.14.ffn_up.weight : tensor<14336x4096xf16>
%135 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.14.ffn_down.weight = util.global.load @__auto.blk.14.ffn_down.weight : tensor<4096x14336xf16>
%136 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xf32>
%137 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.15.attn_q.weight = util.global.load @__auto.blk.15.attn_q.weight : tensor<4096x4096xf16>
%138 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.15.attn_k.weight = util.global.load @__auto.blk.15.attn_k.weight : tensor<1024x4096xf16>
%139 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.15.attn_v.weight = util.global.load @__auto.blk.15.attn_v.weight : tensor<1024x4096xf16>
%140 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.15.attn_output.weight = util.global.load @__auto.blk.15.attn_output.weight : tensor<4096x4096xf16>
%141 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xf32>
%142 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.15.ffn_gate.weight = util.global.load @__auto.blk.15.ffn_gate.weight : tensor<14336x4096xf16>
%143 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.15.ffn_up.weight = util.global.load @__auto.blk.15.ffn_up.weight : tensor<14336x4096xf16>
%144 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.15.ffn_down.weight = util.global.load @__auto.blk.15.ffn_down.weight : tensor<4096x14336xf16>
%145 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xf32>
%146 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.16.attn_q.weight = util.global.load @__auto.blk.16.attn_q.weight : tensor<4096x4096xf16>
%147 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.16.attn_k.weight = util.global.load @__auto.blk.16.attn_k.weight : tensor<1024x4096xf16>
%148 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.16.attn_v.weight = util.global.load @__auto.blk.16.attn_v.weight : tensor<1024x4096xf16>
%149 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.16.attn_output.weight = util.global.load @__auto.blk.16.attn_output.weight : tensor<4096x4096xf16>
%150 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xf32>
%151 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.16.ffn_gate.weight = util.global.load @__auto.blk.16.ffn_gate.weight : tensor<14336x4096xf16>
%152 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.16.ffn_up.weight = util.global.load @__auto.blk.16.ffn_up.weight : tensor<14336x4096xf16>
%153 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.16.ffn_down.weight = util.global.load @__auto.blk.16.ffn_down.weight : tensor<4096x14336xf16>
%154 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xf32>
%155 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.17.attn_q.weight = util.global.load @__auto.blk.17.attn_q.weight : tensor<4096x4096xf16>
%156 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.17.attn_k.weight = util.global.load @__auto.blk.17.attn_k.weight : tensor<1024x4096xf16>
%157 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.17.attn_v.weight = util.global.load @__auto.blk.17.attn_v.weight : tensor<1024x4096xf16>
%158 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.17.attn_output.weight = util.global.load @__auto.blk.17.attn_output.weight : tensor<4096x4096xf16>
%159 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xf32>
%160 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.17.ffn_gate.weight = util.global.load @__auto.blk.17.ffn_gate.weight : tensor<14336x4096xf16>
%161 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.17.ffn_up.weight = util.global.load @__auto.blk.17.ffn_up.weight : tensor<14336x4096xf16>
%162 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.17.ffn_down.weight = util.global.load @__auto.blk.17.ffn_down.weight : tensor<4096x14336xf16>
%163 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xf32>
%164 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.18.attn_q.weight = util.global.load @__auto.blk.18.attn_q.weight : tensor<4096x4096xf16>
%165 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.18.attn_k.weight = util.global.load @__auto.blk.18.attn_k.weight : tensor<1024x4096xf16>
%166 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.18.attn_v.weight = util.global.load @__auto.blk.18.attn_v.weight : tensor<1024x4096xf16>
%167 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.18.attn_output.weight = util.global.load @__auto.blk.18.attn_output.weight : tensor<4096x4096xf16>
%168 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xf32>
%169 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.18.ffn_gate.weight = util.global.load @__auto.blk.18.ffn_gate.weight : tensor<14336x4096xf16>
%170 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.18.ffn_up.weight = util.global.load @__auto.blk.18.ffn_up.weight : tensor<14336x4096xf16>
%171 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.18.ffn_down.weight = util.global.load @__auto.blk.18.ffn_down.weight : tensor<4096x14336xf16>
%172 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xf32>
%173 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.19.attn_q.weight = util.global.load @__auto.blk.19.attn_q.weight : tensor<4096x4096xf16>
%174 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.19.attn_k.weight = util.global.load @__auto.blk.19.attn_k.weight : tensor<1024x4096xf16>
%175 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.19.attn_v.weight = util.global.load @__auto.blk.19.attn_v.weight : tensor<1024x4096xf16>
%176 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.19.attn_output.weight = util.global.load @__auto.blk.19.attn_output.weight : tensor<4096x4096xf16>
%177 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xf32>
%178 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.19.ffn_gate.weight = util.global.load @__auto.blk.19.ffn_gate.weight : tensor<14336x4096xf16>
%179 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.19.ffn_up.weight = util.global.load @__auto.blk.19.ffn_up.weight : tensor<14336x4096xf16>
%180 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.19.ffn_down.weight = util.global.load @__auto.blk.19.ffn_down.weight : tensor<4096x14336xf16>
%181 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xf32>
%182 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.20.attn_q.weight = util.global.load @__auto.blk.20.attn_q.weight : tensor<4096x4096xf16>
%183 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.20.attn_k.weight = util.global.load @__auto.blk.20.attn_k.weight : tensor<1024x4096xf16>
%184 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.20.attn_v.weight = util.global.load @__auto.blk.20.attn_v.weight : tensor<1024x4096xf16>
%185 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.20.attn_output.weight = util.global.load @__auto.blk.20.attn_output.weight : tensor<4096x4096xf16>
%186 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xf32>
%187 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.20.ffn_gate.weight = util.global.load @__auto.blk.20.ffn_gate.weight : tensor<14336x4096xf16>
%188 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.20.ffn_up.weight = util.global.load @__auto.blk.20.ffn_up.weight : tensor<14336x4096xf16>
%189 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.20.ffn_down.weight = util.global.load @__auto.blk.20.ffn_down.weight : tensor<4096x14336xf16>
%190 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xf32>
%191 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.21.attn_q.weight = util.global.load @__auto.blk.21.attn_q.weight : tensor<4096x4096xf16>
%192 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.21.attn_k.weight = util.global.load @__auto.blk.21.attn_k.weight : tensor<1024x4096xf16>
%193 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.21.attn_v.weight = util.global.load @__auto.blk.21.attn_v.weight : tensor<1024x4096xf16>
%194 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.21.attn_output.weight = util.global.load @__auto.blk.21.attn_output.weight : tensor<4096x4096xf16>
%195 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xf32>
%196 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.21.ffn_gate.weight = util.global.load @__auto.blk.21.ffn_gate.weight : tensor<14336x4096xf16>
%197 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.21.ffn_up.weight = util.global.load @__auto.blk.21.ffn_up.weight : tensor<14336x4096xf16>
%198 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.21.ffn_down.weight = util.global.load @__auto.blk.21.ffn_down.weight : tensor<4096x14336xf16>
%199 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xf32>
%200 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.22.attn_q.weight = util.global.load @__auto.blk.22.attn_q.weight : tensor<4096x4096xf16>
%201 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.22.attn_k.weight = util.global.load @__auto.blk.22.attn_k.weight : tensor<1024x4096xf16>
%202 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.22.attn_v.weight = util.global.load @__auto.blk.22.attn_v.weight : tensor<1024x4096xf16>
%203 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.22.attn_output.weight = util.global.load @__auto.blk.22.attn_output.weight : tensor<4096x4096xf16>
%204 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xf32>
%205 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.22.ffn_gate.weight = util.global.load @__auto.blk.22.ffn_gate.weight : tensor<14336x4096xf16>
%206 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.22.ffn_up.weight = util.global.load @__auto.blk.22.ffn_up.weight : tensor<14336x4096xf16>
%207 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.22.ffn_down.weight = util.global.load @__auto.blk.22.ffn_down.weight : tensor<4096x14336xf16>
%208 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xf32>
%209 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.23.attn_q.weight = util.global.load @__auto.blk.23.attn_q.weight : tensor<4096x4096xf16>
%210 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.23.attn_k.weight = util.global.load @__auto.blk.23.attn_k.weight : tensor<1024x4096xf16>
%211 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.23.attn_v.weight = util.global.load @__auto.blk.23.attn_v.weight : tensor<1024x4096xf16>
%212 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.23.attn_output.weight = util.global.load @__auto.blk.23.attn_output.weight : tensor<4096x4096xf16>
%213 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xf32>
%214 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.23.ffn_gate.weight = util.global.load @__auto.blk.23.ffn_gate.weight : tensor<14336x4096xf16>
%215 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.23.ffn_up.weight = util.global.load @__auto.blk.23.ffn_up.weight : tensor<14336x4096xf16>
%216 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.23.ffn_down.weight = util.global.load @__auto.blk.23.ffn_down.weight : tensor<4096x14336xf16>
%217 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xf32>
%218 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.24.attn_q.weight = util.global.load @__auto.blk.24.attn_q.weight : tensor<4096x4096xf16>
%219 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.24.attn_k.weight = util.global.load @__auto.blk.24.attn_k.weight : tensor<1024x4096xf16>
%220 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.24.attn_v.weight = util.global.load @__auto.blk.24.attn_v.weight : tensor<1024x4096xf16>
%221 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.24.attn_output.weight = util.global.load @__auto.blk.24.attn_output.weight : tensor<4096x4096xf16>
%222 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xf32>
%223 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.24.ffn_gate.weight = util.global.load @__auto.blk.24.ffn_gate.weight : tensor<14336x4096xf16>
%224 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.24.ffn_up.weight = util.global.load @__auto.blk.24.ffn_up.weight : tensor<14336x4096xf16>
%225 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.24.ffn_down.weight = util.global.load @__auto.blk.24.ffn_down.weight : tensor<4096x14336xf16>
%226 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xf32>
%227 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.25.attn_q.weight = util.global.load @__auto.blk.25.attn_q.weight : tensor<4096x4096xf16>
%228 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.25.attn_k.weight = util.global.load @__auto.blk.25.attn_k.weight : tensor<1024x4096xf16>
%229 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.25.attn_v.weight = util.global.load @__auto.blk.25.attn_v.weight : tensor<1024x4096xf16>
%230 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.25.attn_output.weight = util.global.load @__auto.blk.25.attn_output.weight : tensor<4096x4096xf16>
%231 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xf32>
%232 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.25.ffn_gate.weight = util.global.load @__auto.blk.25.ffn_gate.weight : tensor<14336x4096xf16>
%233 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.25.ffn_up.weight = util.global.load @__auto.blk.25.ffn_up.weight : tensor<14336x4096xf16>
%234 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.25.ffn_down.weight = util.global.load @__auto.blk.25.ffn_down.weight : tensor<4096x14336xf16>
%235 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xf32>
%236 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.26.attn_q.weight = util.global.load @__auto.blk.26.attn_q.weight : tensor<4096x4096xf16>
%237 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.26.attn_k.weight = util.global.load @__auto.blk.26.attn_k.weight : tensor<1024x4096xf16>
%238 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.26.attn_v.weight = util.global.load @__auto.blk.26.attn_v.weight : tensor<1024x4096xf16>
%239 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.26.attn_output.weight = util.global.load @__auto.blk.26.attn_output.weight : tensor<4096x4096xf16>
%240 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xf32>
%241 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.26.ffn_gate.weight = util.global.load @__auto.blk.26.ffn_gate.weight : tensor<14336x4096xf16>
%242 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.26.ffn_up.weight = util.global.load @__auto.blk.26.ffn_up.weight : tensor<14336x4096xf16>
%243 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.26.ffn_down.weight = util.global.load @__auto.blk.26.ffn_down.weight : tensor<4096x14336xf16>
%244 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xf32>
%245 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.27.attn_q.weight = util.global.load @__auto.blk.27.attn_q.weight : tensor<4096x4096xf16>
%246 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.27.attn_k.weight = util.global.load @__auto.blk.27.attn_k.weight : tensor<1024x4096xf16>
%247 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.27.attn_v.weight = util.global.load @__auto.blk.27.attn_v.weight : tensor<1024x4096xf16>
%248 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.27.attn_output.weight = util.global.load @__auto.blk.27.attn_output.weight : tensor<4096x4096xf16>
%249 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xf32>
%250 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.27.ffn_gate.weight = util.global.load @__auto.blk.27.ffn_gate.weight : tensor<14336x4096xf16>
%251 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.27.ffn_up.weight = util.global.load @__auto.blk.27.ffn_up.weight : tensor<14336x4096xf16>
%252 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.27.ffn_down.weight = util.global.load @__auto.blk.27.ffn_down.weight : tensor<4096x14336xf16>
%253 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xf32>
%254 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.28.attn_q.weight = util.global.load @__auto.blk.28.attn_q.weight : tensor<4096x4096xf16>
%255 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.28.attn_k.weight = util.global.load @__auto.blk.28.attn_k.weight : tensor<1024x4096xf16>
%256 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.28.attn_v.weight = util.global.load @__auto.blk.28.attn_v.weight : tensor<1024x4096xf16>
%257 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.28.attn_output.weight = util.global.load @__auto.blk.28.attn_output.weight : tensor<4096x4096xf16>
%258 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xf32>
%259 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.28.ffn_gate.weight = util.global.load @__auto.blk.28.ffn_gate.weight : tensor<14336x4096xf16>
%260 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.28.ffn_up.weight = util.global.load @__auto.blk.28.ffn_up.weight : tensor<14336x4096xf16>
%261 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.28.ffn_down.weight = util.global.load @__auto.blk.28.ffn_down.weight : tensor<4096x14336xf16>
%262 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xf32>
%263 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.29.attn_q.weight = util.global.load @__auto.blk.29.attn_q.weight : tensor<4096x4096xf16>
%264 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.29.attn_k.weight = util.global.load @__auto.blk.29.attn_k.weight : tensor<1024x4096xf16>
%265 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.29.attn_v.weight = util.global.load @__auto.blk.29.attn_v.weight : tensor<1024x4096xf16>
%266 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.29.attn_output.weight = util.global.load @__auto.blk.29.attn_output.weight : tensor<4096x4096xf16>
%267 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xf32>
%268 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.29.ffn_gate.weight = util.global.load @__auto.blk.29.ffn_gate.weight : tensor<14336x4096xf16>
%269 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.29.ffn_up.weight = util.global.load @__auto.blk.29.ffn_up.weight : tensor<14336x4096xf16>
%270 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.29.ffn_down.weight = util.global.load @__auto.blk.29.ffn_down.weight : tensor<4096x14336xf16>
%271 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xf32>
%272 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.30.attn_q.weight = util.global.load @__auto.blk.30.attn_q.weight : tensor<4096x4096xf16>
%273 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.30.attn_k.weight = util.global.load @__auto.blk.30.attn_k.weight : tensor<1024x4096xf16>
%274 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.30.attn_v.weight = util.global.load @__auto.blk.30.attn_v.weight : tensor<1024x4096xf16>
%275 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.30.attn_output.weight = util.global.load @__auto.blk.30.attn_output.weight : tensor<4096x4096xf16>
%276 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xf32>
%277 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.30.ffn_gate.weight = util.global.load @__auto.blk.30.ffn_gate.weight : tensor<14336x4096xf16>
%278 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.30.ffn_up.weight = util.global.load @__auto.blk.30.ffn_up.weight : tensor<14336x4096xf16>
%279 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.30.ffn_down.weight = util.global.load @__auto.blk.30.ffn_down.weight : tensor<4096x14336xf16>
%280 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xf32>
%281 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.31.attn_q.weight = util.global.load @__auto.blk.31.attn_q.weight : tensor<4096x4096xf16>
%282 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.31.attn_k.weight = util.global.load @__auto.blk.31.attn_k.weight : tensor<1024x4096xf16>
%283 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.31.attn_v.weight = util.global.load @__auto.blk.31.attn_v.weight : tensor<1024x4096xf16>
%284 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.31.attn_output.weight = util.global.load @__auto.blk.31.attn_output.weight : tensor<4096x4096xf16>
%285 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xf32>
%286 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.31.ffn_gate.weight = util.global.load @__auto.blk.31.ffn_gate.weight : tensor<14336x4096xf16>
%287 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.31.ffn_up.weight = util.global.load @__auto.blk.31.ffn_up.weight : tensor<14336x4096xf16>
%288 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.31.ffn_down.weight = util.global.load @__auto.blk.31.ffn_down.weight : tensor<4096x14336xf16>
%289 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
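// End of the per-block parameters. The remaining globals are the model-level
// output normalization weight and the final 128256x4096 vocabulary projection.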
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xf32>
%290 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xf16>
%291 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xf16> -> !torch.vtensor<[128256,4096],f16>
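// Parameter loading ends here and the forward computation begins. Reading from
// the ops below: %arg0 is a [4, ?] batch of token ids, %arg1 appears to hold one
// length/offset per sequence (it is compared against position indices to build a
// mask), and %arg3 is likely a flattened cache buffer, copied into a vtensor.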
%292 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,1048576],f32>
%int1 = torch.constant.int 1
%293 = torch.aten.size.int %arg0, %int1 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int
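// Enumerate token positions 0 .. %293-1 as an si64 tensor on the CPU.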
%int0 = torch.constant.int 0
%int1_0 = torch.constant.int 1
%none = torch.constant.none
%none_1 = torch.constant.none
%cpu = torch.constant.device "cpu"
%false = torch.constant.bool false
%294 = torch.aten.arange.start_step %int0, %293, %int1_0, %none, %none_1, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[?],si64>
%int-1 = torch.constant.int -1
%295 = torch.aten.unsqueeze %arg1, %int-1 : !torch.vtensor<[4],si64>, !torch.int -> !torch.vtensor<[4,1],si64>
%296 = torch.aten.ge.Tensor %294, %295 : !torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64> -> !torch.vtensor<[4,?],i1>
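// %296 flags positions at or past each sequence's %arg1 value (a padding-style
// boolean mask). Next, a precomputed 1x1x8192x8192 boolean causal mask is loaded
// and sliced down to the current sequence length.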
%__auto.constant_1_1_8192_8192_torch.bool = util.global.load @__auto.constant_1_1_8192_8192_torch.bool : tensor<1x1x8192x8192xi1>
%297 = torch_c.from_builtin_tensor %__auto.constant_1_1_8192_8192_torch.bool : tensor<1x1x8192x8192xi1> -> !torch.vtensor<[1,1,8192,8192],i1>
%int0_2 = torch.constant.int 0
%int0_3 = torch.constant.int 0
%int9223372036854775807 = torch.constant.int 9223372036854775807
%int1_4 = torch.constant.int 1
%298 = torch.aten.slice.Tensor %297, %int0_2, %int0_3, %int9223372036854775807, %int1_4 : !torch.vtensor<[1,1,8192,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,8192,8192],i1>
%int1_5 = torch.constant.int 1
%int0_6 = torch.constant.int 0
%int9223372036854775807_7 = torch.constant.int 9223372036854775807
%int1_8 = torch.constant.int 1
%299 = torch.aten.slice.Tensor %298, %int1_5, %int0_6, %int9223372036854775807_7, %int1_8 : !torch.vtensor<[1,1,8192,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,8192,8192],i1>
%int0_9 = torch.constant.int 0
%300 = torch.aten.size.int %294, %int0_9 : !torch.vtensor<[?],si64>, !torch.int -> !torch.int
%int2 = torch.constant.int 2
%int0_10 = torch.constant.int 0
%int1_11 = torch.constant.int 1
%301 = torch.aten.slice.Tensor %299, %int2, %int0_10, %300, %int1_11 : !torch.vtensor<[1,1,8192,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,8192],i1>
%int3 = torch.constant.int 3
%int0_12 = torch.constant.int 0
%int1_13 = torch.constant.int 1
%302 = torch.aten.slice.Tensor %301, %int3, %int0_12, %300, %int1_13 : !torch.vtensor<[1,1,?,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,?],i1>
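// %302 is now the [1,1,seq,seq] causal window; the per-sequence mask from %296
// is broadcast to [4,1,1,seq] below so the two can be combined.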
%int0_14 = torch.constant.int 0
%int0_15 = torch.constant.int 0
%int9223372036854775807_16 = torch.constant.int 9223372036854775807
%int1_17 = torch.constant.int 1
%303 = torch.aten.slice.Tensor %296, %int0_14, %int0_15, %int9223372036854775807_16, %int1_17 : !torch.vtensor<[4,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?],i1>
%int1_18 = torch.constant.int 1
%304 = torch.aten.unsqueeze %303, %int1_18 : !torch.vtensor<[4,?],i1>, !torch.int -> !torch.vtensor<[4,1,?],i1>
%int2_19 = torch.constant.int 2
%305 = torch.aten.unsqueeze %304, %int2_19 : !torch.vtensor<[4,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,1,?],i1>
%int3_20 = torch.constant.int 3
%int0_21 = torch.constant.int 0
%int9223372036854775807_22 = torch.constant.int 9223372036854775807
%int1_23 = torch.constant.int 1
%306 = torch.aten.slice.Tensor %305, %int3_20, %int0_21, %int9223372036854775807_22, %int1_23 : !torch.vtensor<[4,1,1,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,1,1,?],i1>
%int1_24 = torch.constant.int 1
%307 = torch.aten.add.Tensor %302, %306, %int1_24 : !torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,?,?],i1>
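// %307 merges the causal and per-sequence masks (an elementwise add.Tensor on i1
// operands, which is effectively a logical OR). It is then turned into an
// additive attention mask: 0.0 where attention is allowed, -Inf where masked.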
%int6 = torch.constant.int 6
%none_25 = torch.constant.none
%none_26 = torch.constant.none
%false_27 = torch.constant.bool false
%none_28 = torch.constant.none
%308 = torch.aten.zeros_like %307, %int6, %none_25, %none_26, %false_27, %none_28 : !torch.vtensor<[4,1,?,?],i1>, !torch.int, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.vtensor<[4,1,?,?],f32>
%float-Inf = torch.constant.float 0xFFF0000000000000
%int6_29 = torch.constant.int 6
%int0_30 = torch.constant.int 0
%cpu_31 = torch.constant.device "cpu"
%none_32 = torch.constant.none
%309 = torch.aten.scalar_tensor %float-Inf, %int6_29, %int0_30, %cpu_31, %none_32 : !torch.float, !torch.int, !torch.int, !torch.Device, !torch.none -> !torch.vtensor<[],f32>
%310 = torch.aten.where.self %307, %309, %308 : !torch.vtensor<[4,1,?,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[4,1,?,?],f32> -> !torch.vtensor<[4,1,?,?],f32>
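// Token embedding: the f16 embedding table (%0, token_embd.weight) is upcast to
// f32 and gathered with the token ids in %arg0.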
%int6_33 = torch.constant.int 6
%311 = torch.prims.convert_element_type %0, %int6_33 : !torch.vtensor<[128256,4096],f16>, !torch.int -> !torch.vtensor<[128256,4096],f32>
%int-1_34 = torch.constant.int -1
%false_35 = torch.constant.bool false
%false_36 = torch.constant.bool false
%312 = torch.aten.embedding %311, %arg0, %int-1_34, %false_35, %false_36 : !torch.vtensor<[128256,4096],f32>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[4,?,4096],f32>
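// Note that only the embedded activations are returned: the attention and FFN
// weights loaded above are not consumed on this return path, so this listing is
// likely a truncated or minimal export rather than the full forward pass.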
return %312 : !torch.vtensor<[4,?,4096],f32>
}
}