// Gist pashu123/45fe64caa21cfdfa9890698660184a44, created April 30, 2024.
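// Torch-MLIR/IREE input IR for an LLM prefill entry point. The two affine
// maps below are the usual broadcast/identity indexing pair for linalg-style
// elementwise ops later in the module: #map broadcasts a 2-D operand (d1, d2)
// across the leading dimension of a 3-D iteration space, and #map1 is the
// 3-D identity map.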
#map = affine_map<(d0, d1, d2) -> (d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
module @module {
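// Each util.global below binds one named parameter from the "model" parameter
// archive (#stream.parameter.named), so weights are resolved at load time
// rather than embedded in the module. The shapes (128256x4096 token embedding,
// 32 blocks, 4096 model dim, 1024-wide K/V projections, 14336 FFN dim) are
// consistent with a Llama-3-8B-style architecture using grouped-query
// attention (1024 = 8 KV heads x 128 head dim); this reading is inferred from
// the shapes, not stated in the IR itself.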
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xf16>
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.0.attn_q.weight = #stream.parameter.named<"model"::"blk.0.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.0.attn_k.weight = #stream.parameter.named<"model"::"blk.0.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.0.attn_v.weight = #stream.parameter.named<"model"::"blk.0.attn_v.weight"> : tensor<1024x4096xf16>
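// Inlined dense resource: plausibly a precomputed rotary-embedding table,
// 8192 positions x 64 complex values (64 = 128 head dim / 2).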
util.global private @__auto.constant_8192_64_torch.complex64 = dense_resource<__auto.constant_8192_64_torch.complex64> : tensor<8192x64xcomplex<f32>>
util.global private @__auto.blk.0.attn_output.weight = #stream.parameter.named<"model"::"blk.0.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.0.ffn_gate.weight = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_up.weight = #stream.parameter.named<"model"::"blk.0.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.0.ffn_down.weight = #stream.parameter.named<"model"::"blk.0.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.1.attn_q.weight = #stream.parameter.named<"model"::"blk.1.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.1.attn_k.weight = #stream.parameter.named<"model"::"blk.1.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.1.attn_v.weight = #stream.parameter.named<"model"::"blk.1.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.1.attn_output.weight = #stream.parameter.named<"model"::"blk.1.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.1.ffn_gate.weight = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.1.ffn_up.weight = #stream.parameter.named<"model"::"blk.1.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.1.ffn_down.weight = #stream.parameter.named<"model"::"blk.1.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.2.attn_q.weight = #stream.parameter.named<"model"::"blk.2.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.2.attn_k.weight = #stream.parameter.named<"model"::"blk.2.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.2.attn_v.weight = #stream.parameter.named<"model"::"blk.2.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.2.attn_output.weight = #stream.parameter.named<"model"::"blk.2.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.2.ffn_gate.weight = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.2.ffn_up.weight = #stream.parameter.named<"model"::"blk.2.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.2.ffn_down.weight = #stream.parameter.named<"model"::"blk.2.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.3.attn_q.weight = #stream.parameter.named<"model"::"blk.3.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.3.attn_k.weight = #stream.parameter.named<"model"::"blk.3.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.3.attn_v.weight = #stream.parameter.named<"model"::"blk.3.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.3.attn_output.weight = #stream.parameter.named<"model"::"blk.3.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.3.ffn_gate.weight = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.3.ffn_up.weight = #stream.parameter.named<"model"::"blk.3.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.3.ffn_down.weight = #stream.parameter.named<"model"::"blk.3.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.4.attn_q.weight = #stream.parameter.named<"model"::"blk.4.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.4.attn_k.weight = #stream.parameter.named<"model"::"blk.4.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.4.attn_v.weight = #stream.parameter.named<"model"::"blk.4.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.4.attn_output.weight = #stream.parameter.named<"model"::"blk.4.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.4.ffn_gate.weight = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.4.ffn_up.weight = #stream.parameter.named<"model"::"blk.4.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.4.ffn_down.weight = #stream.parameter.named<"model"::"blk.4.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.5.attn_q.weight = #stream.parameter.named<"model"::"blk.5.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.5.attn_k.weight = #stream.parameter.named<"model"::"blk.5.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.5.attn_v.weight = #stream.parameter.named<"model"::"blk.5.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.5.attn_output.weight = #stream.parameter.named<"model"::"blk.5.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.5.ffn_gate.weight = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.5.ffn_up.weight = #stream.parameter.named<"model"::"blk.5.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.5.ffn_down.weight = #stream.parameter.named<"model"::"blk.5.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.6.attn_q.weight = #stream.parameter.named<"model"::"blk.6.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.6.attn_k.weight = #stream.parameter.named<"model"::"blk.6.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.6.attn_v.weight = #stream.parameter.named<"model"::"blk.6.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.6.attn_output.weight = #stream.parameter.named<"model"::"blk.6.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.6.ffn_gate.weight = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.6.ffn_up.weight = #stream.parameter.named<"model"::"blk.6.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.6.ffn_down.weight = #stream.parameter.named<"model"::"blk.6.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.7.attn_q.weight = #stream.parameter.named<"model"::"blk.7.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.7.attn_k.weight = #stream.parameter.named<"model"::"blk.7.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.7.attn_v.weight = #stream.parameter.named<"model"::"blk.7.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.7.attn_output.weight = #stream.parameter.named<"model"::"blk.7.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.7.ffn_gate.weight = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.7.ffn_up.weight = #stream.parameter.named<"model"::"blk.7.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.7.ffn_down.weight = #stream.parameter.named<"model"::"blk.7.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.8.attn_q.weight = #stream.parameter.named<"model"::"blk.8.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.8.attn_k.weight = #stream.parameter.named<"model"::"blk.8.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.8.attn_v.weight = #stream.parameter.named<"model"::"blk.8.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.8.attn_output.weight = #stream.parameter.named<"model"::"blk.8.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.8.ffn_gate.weight = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.8.ffn_up.weight = #stream.parameter.named<"model"::"blk.8.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.8.ffn_down.weight = #stream.parameter.named<"model"::"blk.8.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.9.attn_q.weight = #stream.parameter.named<"model"::"blk.9.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.9.attn_k.weight = #stream.parameter.named<"model"::"blk.9.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.9.attn_v.weight = #stream.parameter.named<"model"::"blk.9.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.9.attn_output.weight = #stream.parameter.named<"model"::"blk.9.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.9.ffn_gate.weight = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.9.ffn_up.weight = #stream.parameter.named<"model"::"blk.9.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.9.ffn_down.weight = #stream.parameter.named<"model"::"blk.9.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.10.attn_q.weight = #stream.parameter.named<"model"::"blk.10.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.10.attn_k.weight = #stream.parameter.named<"model"::"blk.10.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.10.attn_v.weight = #stream.parameter.named<"model"::"blk.10.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.10.attn_output.weight = #stream.parameter.named<"model"::"blk.10.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.10.ffn_gate.weight = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.10.ffn_up.weight = #stream.parameter.named<"model"::"blk.10.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.10.ffn_down.weight = #stream.parameter.named<"model"::"blk.10.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.11.attn_q.weight = #stream.parameter.named<"model"::"blk.11.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.11.attn_k.weight = #stream.parameter.named<"model"::"blk.11.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.11.attn_v.weight = #stream.parameter.named<"model"::"blk.11.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.11.attn_output.weight = #stream.parameter.named<"model"::"blk.11.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.11.ffn_gate.weight = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.11.ffn_up.weight = #stream.parameter.named<"model"::"blk.11.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.11.ffn_down.weight = #stream.parameter.named<"model"::"blk.11.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.12.attn_q.weight = #stream.parameter.named<"model"::"blk.12.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.12.attn_k.weight = #stream.parameter.named<"model"::"blk.12.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.12.attn_v.weight = #stream.parameter.named<"model"::"blk.12.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.12.attn_output.weight = #stream.parameter.named<"model"::"blk.12.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.12.ffn_gate.weight = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.12.ffn_up.weight = #stream.parameter.named<"model"::"blk.12.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.12.ffn_down.weight = #stream.parameter.named<"model"::"blk.12.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.13.attn_q.weight = #stream.parameter.named<"model"::"blk.13.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.13.attn_k.weight = #stream.parameter.named<"model"::"blk.13.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.13.attn_v.weight = #stream.parameter.named<"model"::"blk.13.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.13.attn_output.weight = #stream.parameter.named<"model"::"blk.13.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.13.ffn_gate.weight = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.13.ffn_up.weight = #stream.parameter.named<"model"::"blk.13.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.13.ffn_down.weight = #stream.parameter.named<"model"::"blk.13.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.14.attn_q.weight = #stream.parameter.named<"model"::"blk.14.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.14.attn_k.weight = #stream.parameter.named<"model"::"blk.14.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.14.attn_v.weight = #stream.parameter.named<"model"::"blk.14.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.14.attn_output.weight = #stream.parameter.named<"model"::"blk.14.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.14.ffn_gate.weight = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.14.ffn_up.weight = #stream.parameter.named<"model"::"blk.14.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.14.ffn_down.weight = #stream.parameter.named<"model"::"blk.14.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.15.attn_q.weight = #stream.parameter.named<"model"::"blk.15.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.15.attn_k.weight = #stream.parameter.named<"model"::"blk.15.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.15.attn_v.weight = #stream.parameter.named<"model"::"blk.15.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.15.attn_output.weight = #stream.parameter.named<"model"::"blk.15.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.15.ffn_gate.weight = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.15.ffn_up.weight = #stream.parameter.named<"model"::"blk.15.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.15.ffn_down.weight = #stream.parameter.named<"model"::"blk.15.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.16.attn_q.weight = #stream.parameter.named<"model"::"blk.16.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.16.attn_k.weight = #stream.parameter.named<"model"::"blk.16.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.16.attn_v.weight = #stream.parameter.named<"model"::"blk.16.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.16.attn_output.weight = #stream.parameter.named<"model"::"blk.16.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.16.ffn_gate.weight = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.16.ffn_up.weight = #stream.parameter.named<"model"::"blk.16.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.16.ffn_down.weight = #stream.parameter.named<"model"::"blk.16.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.17.attn_q.weight = #stream.parameter.named<"model"::"blk.17.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.17.attn_k.weight = #stream.parameter.named<"model"::"blk.17.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.17.attn_v.weight = #stream.parameter.named<"model"::"blk.17.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.17.attn_output.weight = #stream.parameter.named<"model"::"blk.17.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.17.ffn_gate.weight = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.17.ffn_up.weight = #stream.parameter.named<"model"::"blk.17.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.17.ffn_down.weight = #stream.parameter.named<"model"::"blk.17.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.18.attn_q.weight = #stream.parameter.named<"model"::"blk.18.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.18.attn_k.weight = #stream.parameter.named<"model"::"blk.18.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.18.attn_v.weight = #stream.parameter.named<"model"::"blk.18.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.18.attn_output.weight = #stream.parameter.named<"model"::"blk.18.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.18.ffn_gate.weight = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.18.ffn_up.weight = #stream.parameter.named<"model"::"blk.18.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.18.ffn_down.weight = #stream.parameter.named<"model"::"blk.18.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.19.attn_q.weight = #stream.parameter.named<"model"::"blk.19.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.19.attn_k.weight = #stream.parameter.named<"model"::"blk.19.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.19.attn_v.weight = #stream.parameter.named<"model"::"blk.19.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.19.attn_output.weight = #stream.parameter.named<"model"::"blk.19.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.19.ffn_gate.weight = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.19.ffn_up.weight = #stream.parameter.named<"model"::"blk.19.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.19.ffn_down.weight = #stream.parameter.named<"model"::"blk.19.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.20.attn_q.weight = #stream.parameter.named<"model"::"blk.20.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.20.attn_k.weight = #stream.parameter.named<"model"::"blk.20.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.20.attn_v.weight = #stream.parameter.named<"model"::"blk.20.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.20.attn_output.weight = #stream.parameter.named<"model"::"blk.20.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.20.ffn_gate.weight = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.20.ffn_up.weight = #stream.parameter.named<"model"::"blk.20.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.20.ffn_down.weight = #stream.parameter.named<"model"::"blk.20.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.21.attn_q.weight = #stream.parameter.named<"model"::"blk.21.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.21.attn_k.weight = #stream.parameter.named<"model"::"blk.21.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.21.attn_v.weight = #stream.parameter.named<"model"::"blk.21.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.21.attn_output.weight = #stream.parameter.named<"model"::"blk.21.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.21.ffn_gate.weight = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.21.ffn_up.weight = #stream.parameter.named<"model"::"blk.21.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.21.ffn_down.weight = #stream.parameter.named<"model"::"blk.21.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.22.attn_q.weight = #stream.parameter.named<"model"::"blk.22.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.22.attn_k.weight = #stream.parameter.named<"model"::"blk.22.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.22.attn_v.weight = #stream.parameter.named<"model"::"blk.22.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.22.attn_output.weight = #stream.parameter.named<"model"::"blk.22.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.22.ffn_gate.weight = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.22.ffn_up.weight = #stream.parameter.named<"model"::"blk.22.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.22.ffn_down.weight = #stream.parameter.named<"model"::"blk.22.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.23.attn_q.weight = #stream.parameter.named<"model"::"blk.23.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.23.attn_k.weight = #stream.parameter.named<"model"::"blk.23.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.23.attn_v.weight = #stream.parameter.named<"model"::"blk.23.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.23.attn_output.weight = #stream.parameter.named<"model"::"blk.23.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.23.ffn_gate.weight = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.23.ffn_up.weight = #stream.parameter.named<"model"::"blk.23.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.23.ffn_down.weight = #stream.parameter.named<"model"::"blk.23.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.24.attn_q.weight = #stream.parameter.named<"model"::"blk.24.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.24.attn_k.weight = #stream.parameter.named<"model"::"blk.24.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.24.attn_v.weight = #stream.parameter.named<"model"::"blk.24.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.24.attn_output.weight = #stream.parameter.named<"model"::"blk.24.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.24.ffn_gate.weight = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.24.ffn_up.weight = #stream.parameter.named<"model"::"blk.24.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.24.ffn_down.weight = #stream.parameter.named<"model"::"blk.24.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.25.attn_q.weight = #stream.parameter.named<"model"::"blk.25.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.25.attn_k.weight = #stream.parameter.named<"model"::"blk.25.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.25.attn_v.weight = #stream.parameter.named<"model"::"blk.25.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.25.attn_output.weight = #stream.parameter.named<"model"::"blk.25.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.25.ffn_gate.weight = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.25.ffn_up.weight = #stream.parameter.named<"model"::"blk.25.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.25.ffn_down.weight = #stream.parameter.named<"model"::"blk.25.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.26.attn_q.weight = #stream.parameter.named<"model"::"blk.26.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.26.attn_k.weight = #stream.parameter.named<"model"::"blk.26.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.26.attn_v.weight = #stream.parameter.named<"model"::"blk.26.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.26.attn_output.weight = #stream.parameter.named<"model"::"blk.26.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.26.ffn_gate.weight = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.26.ffn_up.weight = #stream.parameter.named<"model"::"blk.26.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.26.ffn_down.weight = #stream.parameter.named<"model"::"blk.26.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.27.attn_q.weight = #stream.parameter.named<"model"::"blk.27.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.27.attn_k.weight = #stream.parameter.named<"model"::"blk.27.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.27.attn_v.weight = #stream.parameter.named<"model"::"blk.27.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.27.attn_output.weight = #stream.parameter.named<"model"::"blk.27.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.27.ffn_gate.weight = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.27.ffn_up.weight = #stream.parameter.named<"model"::"blk.27.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.27.ffn_down.weight = #stream.parameter.named<"model"::"blk.27.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.28.attn_q.weight = #stream.parameter.named<"model"::"blk.28.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.28.attn_k.weight = #stream.parameter.named<"model"::"blk.28.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.28.attn_v.weight = #stream.parameter.named<"model"::"blk.28.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.28.attn_output.weight = #stream.parameter.named<"model"::"blk.28.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.28.ffn_gate.weight = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.28.ffn_up.weight = #stream.parameter.named<"model"::"blk.28.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.28.ffn_down.weight = #stream.parameter.named<"model"::"blk.28.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.29.attn_q.weight = #stream.parameter.named<"model"::"blk.29.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.29.attn_k.weight = #stream.parameter.named<"model"::"blk.29.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.29.attn_v.weight = #stream.parameter.named<"model"::"blk.29.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.29.attn_output.weight = #stream.parameter.named<"model"::"blk.29.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.29.ffn_gate.weight = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.29.ffn_up.weight = #stream.parameter.named<"model"::"blk.29.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.29.ffn_down.weight = #stream.parameter.named<"model"::"blk.29.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.30.attn_q.weight = #stream.parameter.named<"model"::"blk.30.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.30.attn_k.weight = #stream.parameter.named<"model"::"blk.30.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.30.attn_v.weight = #stream.parameter.named<"model"::"blk.30.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.30.attn_output.weight = #stream.parameter.named<"model"::"blk.30.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.30.ffn_gate.weight = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.30.ffn_up.weight = #stream.parameter.named<"model"::"blk.30.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.30.ffn_down.weight = #stream.parameter.named<"model"::"blk.30.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.31.attn_q.weight = #stream.parameter.named<"model"::"blk.31.attn_q.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.31.attn_k.weight = #stream.parameter.named<"model"::"blk.31.attn_k.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.31.attn_v.weight = #stream.parameter.named<"model"::"blk.31.attn_v.weight"> : tensor<1024x4096xf16>
util.global private @__auto.blk.31.attn_output.weight = #stream.parameter.named<"model"::"blk.31.attn_output.weight"> : tensor<4096x4096xf16>
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xf32>
util.global private @__auto.blk.31.ffn_gate.weight = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.31.ffn_up.weight = #stream.parameter.named<"model"::"blk.31.ffn_up.weight"> : tensor<14336x4096xf16>
util.global private @__auto.blk.31.ffn_down.weight = #stream.parameter.named<"model"::"blk.31.ffn_down.weight"> : tensor<4096x14336xf16>
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xf32>
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xf16>
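// Second inlined resource: a 1x1x8192x8192 boolean tensor, plausibly the
// causal attention mask for a maximum context length of 8192.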
util.global private @__auto.constant_1_1_8192_8192_torch.bool = dense_resource<__auto.constant_1_1_8192_8192_torch.bool> : tensor<1x1x8192x8192xi1>
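// Prefill entry point for batch size 4. A plausible reading of the signature:
// %arg0 holds token ids [4,?], %arg1 per-sequence lengths [4], %arg2 page ids
// into a paged KV cache [4,?], and %arg3 the flat cache slab, where each
// 1048576-element page would fit
// 32 layers x 2 (K,V) x 16 tokens x 8 KV heads x 128 head dim = 1048576
// f32 values. Only the shapes are certain; the argument roles are inferred.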
func.func @prefill_bs4(%arg0: !torch.vtensor<[4,?],si64>, %arg1: !torch.vtensor<[4],si64>, %arg2: !torch.vtensor<[4,?],si64>, %arg3: !torch.tensor<[?,1048576],f32>) -> !torch.vtensor<[4,?,4096],f32> {
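// The body first materializes every parameter: each util.global.load yields a
// builtin tensor, which torch_c.from_builtin_tensor bridges into the
// !torch.vtensor type system. This load/bridge pair repeats for all 32 blocks
// before any compute is emitted.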
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xf16>
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xf16> -> !torch.vtensor<[128256,4096],f16>
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xf32>
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.0.attn_q.weight = util.global.load @__auto.blk.0.attn_q.weight : tensor<4096x4096xf16>
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.0.attn_k.weight = util.global.load @__auto.blk.0.attn_k.weight : tensor<1024x4096xf16>
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.0.attn_v.weight = util.global.load @__auto.blk.0.attn_v.weight : tensor<1024x4096xf16>
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.constant_8192_64_torch.complex64 = util.global.load @__auto.constant_8192_64_torch.complex64 : tensor<8192x64xcomplex<f32>>
%5 = torch_c.from_builtin_tensor %__auto.constant_8192_64_torch.complex64 : tensor<8192x64xcomplex<f32>> -> !torch.vtensor<[8192,64],complex<f32>>
%__auto.blk.0.attn_output.weight = util.global.load @__auto.blk.0.attn_output.weight : tensor<4096x4096xf16>
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xf32>
%7 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.0.ffn_gate.weight = util.global.load @__auto.blk.0.ffn_gate.weight : tensor<14336x4096xf16>
%8 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.0.ffn_up.weight = util.global.load @__auto.blk.0.ffn_up.weight : tensor<14336x4096xf16>
%9 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.0.ffn_down.weight = util.global.load @__auto.blk.0.ffn_down.weight : tensor<4096x14336xf16>
%10 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xf32>
%11 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.1.attn_q.weight = util.global.load @__auto.blk.1.attn_q.weight : tensor<4096x4096xf16>
%12 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.1.attn_k.weight = util.global.load @__auto.blk.1.attn_k.weight : tensor<1024x4096xf16>
%13 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.1.attn_v.weight = util.global.load @__auto.blk.1.attn_v.weight : tensor<1024x4096xf16>
%14 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.1.attn_output.weight = util.global.load @__auto.blk.1.attn_output.weight : tensor<4096x4096xf16>
%15 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xf32>
%16 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.1.ffn_gate.weight = util.global.load @__auto.blk.1.ffn_gate.weight : tensor<14336x4096xf16>
%17 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.1.ffn_up.weight = util.global.load @__auto.blk.1.ffn_up.weight : tensor<14336x4096xf16>
%18 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.1.ffn_down.weight = util.global.load @__auto.blk.1.ffn_down.weight : tensor<4096x14336xf16>
%19 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xf32>
%20 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.2.attn_q.weight = util.global.load @__auto.blk.2.attn_q.weight : tensor<4096x4096xf16>
%21 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.2.attn_k.weight = util.global.load @__auto.blk.2.attn_k.weight : tensor<1024x4096xf16>
%22 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.2.attn_v.weight = util.global.load @__auto.blk.2.attn_v.weight : tensor<1024x4096xf16>
%23 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.2.attn_output.weight = util.global.load @__auto.blk.2.attn_output.weight : tensor<4096x4096xf16>
%24 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xf32>
%25 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.2.ffn_gate.weight = util.global.load @__auto.blk.2.ffn_gate.weight : tensor<14336x4096xf16>
%26 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.2.ffn_up.weight = util.global.load @__auto.blk.2.ffn_up.weight : tensor<14336x4096xf16>
%27 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.2.ffn_down.weight = util.global.load @__auto.blk.2.ffn_down.weight : tensor<4096x14336xf16>
%28 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xf32>
%29 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.3.attn_q.weight = util.global.load @__auto.blk.3.attn_q.weight : tensor<4096x4096xf16>
%30 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.3.attn_k.weight = util.global.load @__auto.blk.3.attn_k.weight : tensor<1024x4096xf16>
%31 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.3.attn_v.weight = util.global.load @__auto.blk.3.attn_v.weight : tensor<1024x4096xf16>
%32 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.3.attn_output.weight = util.global.load @__auto.blk.3.attn_output.weight : tensor<4096x4096xf16>
%33 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xf32>
%34 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.3.ffn_gate.weight = util.global.load @__auto.blk.3.ffn_gate.weight : tensor<14336x4096xf16>
%35 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.3.ffn_up.weight = util.global.load @__auto.blk.3.ffn_up.weight : tensor<14336x4096xf16>
%36 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.3.ffn_down.weight = util.global.load @__auto.blk.3.ffn_down.weight : tensor<4096x14336xf16>
%37 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xf32>
%38 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.4.attn_q.weight = util.global.load @__auto.blk.4.attn_q.weight : tensor<4096x4096xf16>
%39 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.4.attn_k.weight = util.global.load @__auto.blk.4.attn_k.weight : tensor<1024x4096xf16>
%40 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.4.attn_v.weight = util.global.load @__auto.blk.4.attn_v.weight : tensor<1024x4096xf16>
%41 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.4.attn_output.weight = util.global.load @__auto.blk.4.attn_output.weight : tensor<4096x4096xf16>
%42 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xf32>
%43 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.4.ffn_gate.weight = util.global.load @__auto.blk.4.ffn_gate.weight : tensor<14336x4096xf16>
%44 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.4.ffn_up.weight = util.global.load @__auto.blk.4.ffn_up.weight : tensor<14336x4096xf16>
%45 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.4.ffn_down.weight = util.global.load @__auto.blk.4.ffn_down.weight : tensor<4096x14336xf16>
%46 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xf32>
%47 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.5.attn_q.weight = util.global.load @__auto.blk.5.attn_q.weight : tensor<4096x4096xf16>
%48 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.5.attn_k.weight = util.global.load @__auto.blk.5.attn_k.weight : tensor<1024x4096xf16>
%49 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.5.attn_v.weight = util.global.load @__auto.blk.5.attn_v.weight : tensor<1024x4096xf16>
%50 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.5.attn_output.weight = util.global.load @__auto.blk.5.attn_output.weight : tensor<4096x4096xf16>
%51 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xf32>
%52 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.5.ffn_gate.weight = util.global.load @__auto.blk.5.ffn_gate.weight : tensor<14336x4096xf16>
%53 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.5.ffn_up.weight = util.global.load @__auto.blk.5.ffn_up.weight : tensor<14336x4096xf16>
%54 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.5.ffn_down.weight = util.global.load @__auto.blk.5.ffn_down.weight : tensor<4096x14336xf16>
%55 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xf32>
%56 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.6.attn_q.weight = util.global.load @__auto.blk.6.attn_q.weight : tensor<4096x4096xf16>
%57 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.6.attn_k.weight = util.global.load @__auto.blk.6.attn_k.weight : tensor<1024x4096xf16>
%58 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.6.attn_v.weight = util.global.load @__auto.blk.6.attn_v.weight : tensor<1024x4096xf16>
%59 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.6.attn_output.weight = util.global.load @__auto.blk.6.attn_output.weight : tensor<4096x4096xf16>
%60 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xf32>
%61 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.6.ffn_gate.weight = util.global.load @__auto.blk.6.ffn_gate.weight : tensor<14336x4096xf16>
%62 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.6.ffn_up.weight = util.global.load @__auto.blk.6.ffn_up.weight : tensor<14336x4096xf16>
%63 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.6.ffn_down.weight = util.global.load @__auto.blk.6.ffn_down.weight : tensor<4096x14336xf16>
%64 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xf32>
%65 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.7.attn_q.weight = util.global.load @__auto.blk.7.attn_q.weight : tensor<4096x4096xf16>
%66 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.7.attn_k.weight = util.global.load @__auto.blk.7.attn_k.weight : tensor<1024x4096xf16>
%67 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.7.attn_v.weight = util.global.load @__auto.blk.7.attn_v.weight : tensor<1024x4096xf16>
%68 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.7.attn_output.weight = util.global.load @__auto.blk.7.attn_output.weight : tensor<4096x4096xf16>
%69 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xf32>
%70 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.7.ffn_gate.weight = util.global.load @__auto.blk.7.ffn_gate.weight : tensor<14336x4096xf16>
%71 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.7.ffn_up.weight = util.global.load @__auto.blk.7.ffn_up.weight : tensor<14336x4096xf16>
%72 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.7.ffn_down.weight = util.global.load @__auto.blk.7.ffn_down.weight : tensor<4096x14336xf16>
%73 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xf32>
%74 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.8.attn_q.weight = util.global.load @__auto.blk.8.attn_q.weight : tensor<4096x4096xf16>
%75 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.8.attn_k.weight = util.global.load @__auto.blk.8.attn_k.weight : tensor<1024x4096xf16>
%76 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.8.attn_v.weight = util.global.load @__auto.blk.8.attn_v.weight : tensor<1024x4096xf16>
%77 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.8.attn_output.weight = util.global.load @__auto.blk.8.attn_output.weight : tensor<4096x4096xf16>
%78 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xf32>
%79 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.8.ffn_gate.weight = util.global.load @__auto.blk.8.ffn_gate.weight : tensor<14336x4096xf16>
%80 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.8.ffn_up.weight = util.global.load @__auto.blk.8.ffn_up.weight : tensor<14336x4096xf16>
%81 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.8.ffn_down.weight = util.global.load @__auto.blk.8.ffn_down.weight : tensor<4096x14336xf16>
%82 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xf32>
%83 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.9.attn_q.weight = util.global.load @__auto.blk.9.attn_q.weight : tensor<4096x4096xf16>
%84 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.9.attn_k.weight = util.global.load @__auto.blk.9.attn_k.weight : tensor<1024x4096xf16>
%85 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.9.attn_v.weight = util.global.load @__auto.blk.9.attn_v.weight : tensor<1024x4096xf16>
%86 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.9.attn_output.weight = util.global.load @__auto.blk.9.attn_output.weight : tensor<4096x4096xf16>
%87 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xf32>
%88 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.9.ffn_gate.weight = util.global.load @__auto.blk.9.ffn_gate.weight : tensor<14336x4096xf16>
%89 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.9.ffn_up.weight = util.global.load @__auto.blk.9.ffn_up.weight : tensor<14336x4096xf16>
%90 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.9.ffn_down.weight = util.global.load @__auto.blk.9.ffn_down.weight : tensor<4096x14336xf16>
%91 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xf32>
%92 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.10.attn_q.weight = util.global.load @__auto.blk.10.attn_q.weight : tensor<4096x4096xf16>
%93 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.10.attn_k.weight = util.global.load @__auto.blk.10.attn_k.weight : tensor<1024x4096xf16>
%94 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.10.attn_v.weight = util.global.load @__auto.blk.10.attn_v.weight : tensor<1024x4096xf16>
%95 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.10.attn_output.weight = util.global.load @__auto.blk.10.attn_output.weight : tensor<4096x4096xf16>
%96 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xf32>
%97 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.10.ffn_gate.weight = util.global.load @__auto.blk.10.ffn_gate.weight : tensor<14336x4096xf16>
%98 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.10.ffn_up.weight = util.global.load @__auto.blk.10.ffn_up.weight : tensor<14336x4096xf16>
%99 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.10.ffn_down.weight = util.global.load @__auto.blk.10.ffn_down.weight : tensor<4096x14336xf16>
%100 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xf32>
%101 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.11.attn_q.weight = util.global.load @__auto.blk.11.attn_q.weight : tensor<4096x4096xf16>
%102 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.11.attn_k.weight = util.global.load @__auto.blk.11.attn_k.weight : tensor<1024x4096xf16>
%103 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.11.attn_v.weight = util.global.load @__auto.blk.11.attn_v.weight : tensor<1024x4096xf16>
%104 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.11.attn_output.weight = util.global.load @__auto.blk.11.attn_output.weight : tensor<4096x4096xf16>
%105 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xf32>
%106 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.11.ffn_gate.weight = util.global.load @__auto.blk.11.ffn_gate.weight : tensor<14336x4096xf16>
%107 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.11.ffn_up.weight = util.global.load @__auto.blk.11.ffn_up.weight : tensor<14336x4096xf16>
%108 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.11.ffn_down.weight = util.global.load @__auto.blk.11.ffn_down.weight : tensor<4096x14336xf16>
%109 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xf32>
%110 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.12.attn_q.weight = util.global.load @__auto.blk.12.attn_q.weight : tensor<4096x4096xf16>
%111 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.12.attn_k.weight = util.global.load @__auto.blk.12.attn_k.weight : tensor<1024x4096xf16>
%112 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.12.attn_v.weight = util.global.load @__auto.blk.12.attn_v.weight : tensor<1024x4096xf16>
%113 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.12.attn_output.weight = util.global.load @__auto.blk.12.attn_output.weight : tensor<4096x4096xf16>
%114 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xf32>
%115 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.12.ffn_gate.weight = util.global.load @__auto.blk.12.ffn_gate.weight : tensor<14336x4096xf16>
%116 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.12.ffn_up.weight = util.global.load @__auto.blk.12.ffn_up.weight : tensor<14336x4096xf16>
%117 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.12.ffn_down.weight = util.global.load @__auto.blk.12.ffn_down.weight : tensor<4096x14336xf16>
%118 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xf32>
%119 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.13.attn_q.weight = util.global.load @__auto.blk.13.attn_q.weight : tensor<4096x4096xf16>
%120 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.13.attn_k.weight = util.global.load @__auto.blk.13.attn_k.weight : tensor<1024x4096xf16>
%121 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.13.attn_v.weight = util.global.load @__auto.blk.13.attn_v.weight : tensor<1024x4096xf16>
%122 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.13.attn_output.weight = util.global.load @__auto.blk.13.attn_output.weight : tensor<4096x4096xf16>
%123 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xf32>
%124 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.13.ffn_gate.weight = util.global.load @__auto.blk.13.ffn_gate.weight : tensor<14336x4096xf16>
%125 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.13.ffn_up.weight = util.global.load @__auto.blk.13.ffn_up.weight : tensor<14336x4096xf16>
%126 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.13.ffn_down.weight = util.global.load @__auto.blk.13.ffn_down.weight : tensor<4096x14336xf16>
%127 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xf32>
%128 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.14.attn_q.weight = util.global.load @__auto.blk.14.attn_q.weight : tensor<4096x4096xf16>
%129 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.14.attn_k.weight = util.global.load @__auto.blk.14.attn_k.weight : tensor<1024x4096xf16>
%130 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.14.attn_v.weight = util.global.load @__auto.blk.14.attn_v.weight : tensor<1024x4096xf16>
%131 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.14.attn_output.weight = util.global.load @__auto.blk.14.attn_output.weight : tensor<4096x4096xf16>
%132 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xf32>
%133 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.14.ffn_gate.weight = util.global.load @__auto.blk.14.ffn_gate.weight : tensor<14336x4096xf16>
%134 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.14.ffn_up.weight = util.global.load @__auto.blk.14.ffn_up.weight : tensor<14336x4096xf16>
%135 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.14.ffn_down.weight = util.global.load @__auto.blk.14.ffn_down.weight : tensor<4096x14336xf16>
%136 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xf32>
%137 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.15.attn_q.weight = util.global.load @__auto.blk.15.attn_q.weight : tensor<4096x4096xf16>
%138 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.15.attn_k.weight = util.global.load @__auto.blk.15.attn_k.weight : tensor<1024x4096xf16>
%139 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.15.attn_v.weight = util.global.load @__auto.blk.15.attn_v.weight : tensor<1024x4096xf16>
%140 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.15.attn_output.weight = util.global.load @__auto.blk.15.attn_output.weight : tensor<4096x4096xf16>
%141 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xf32>
%142 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.15.ffn_gate.weight = util.global.load @__auto.blk.15.ffn_gate.weight : tensor<14336x4096xf16>
%143 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.15.ffn_up.weight = util.global.load @__auto.blk.15.ffn_up.weight : tensor<14336x4096xf16>
%144 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.15.ffn_down.weight = util.global.load @__auto.blk.15.ffn_down.weight : tensor<4096x14336xf16>
%145 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xf32>
%146 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.16.attn_q.weight = util.global.load @__auto.blk.16.attn_q.weight : tensor<4096x4096xf16>
%147 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.16.attn_k.weight = util.global.load @__auto.blk.16.attn_k.weight : tensor<1024x4096xf16>
%148 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.16.attn_v.weight = util.global.load @__auto.blk.16.attn_v.weight : tensor<1024x4096xf16>
%149 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.16.attn_output.weight = util.global.load @__auto.blk.16.attn_output.weight : tensor<4096x4096xf16>
%150 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xf32>
%151 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.16.ffn_gate.weight = util.global.load @__auto.blk.16.ffn_gate.weight : tensor<14336x4096xf16>
%152 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.16.ffn_up.weight = util.global.load @__auto.blk.16.ffn_up.weight : tensor<14336x4096xf16>
%153 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.16.ffn_down.weight = util.global.load @__auto.blk.16.ffn_down.weight : tensor<4096x14336xf16>
%154 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xf32>
%155 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.17.attn_q.weight = util.global.load @__auto.blk.17.attn_q.weight : tensor<4096x4096xf16>
%156 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.17.attn_k.weight = util.global.load @__auto.blk.17.attn_k.weight : tensor<1024x4096xf16>
%157 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.17.attn_v.weight = util.global.load @__auto.blk.17.attn_v.weight : tensor<1024x4096xf16>
%158 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.17.attn_output.weight = util.global.load @__auto.blk.17.attn_output.weight : tensor<4096x4096xf16>
%159 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xf32>
%160 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.17.ffn_gate.weight = util.global.load @__auto.blk.17.ffn_gate.weight : tensor<14336x4096xf16>
%161 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.17.ffn_up.weight = util.global.load @__auto.blk.17.ffn_up.weight : tensor<14336x4096xf16>
%162 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.17.ffn_down.weight = util.global.load @__auto.blk.17.ffn_down.weight : tensor<4096x14336xf16>
%163 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xf32>
%164 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.18.attn_q.weight = util.global.load @__auto.blk.18.attn_q.weight : tensor<4096x4096xf16>
%165 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.18.attn_k.weight = util.global.load @__auto.blk.18.attn_k.weight : tensor<1024x4096xf16>
%166 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.18.attn_v.weight = util.global.load @__auto.blk.18.attn_v.weight : tensor<1024x4096xf16>
%167 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.18.attn_output.weight = util.global.load @__auto.blk.18.attn_output.weight : tensor<4096x4096xf16>
%168 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xf32>
%169 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.18.ffn_gate.weight = util.global.load @__auto.blk.18.ffn_gate.weight : tensor<14336x4096xf16>
%170 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.18.ffn_up.weight = util.global.load @__auto.blk.18.ffn_up.weight : tensor<14336x4096xf16>
%171 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.18.ffn_down.weight = util.global.load @__auto.blk.18.ffn_down.weight : tensor<4096x14336xf16>
%172 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xf32>
%173 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.19.attn_q.weight = util.global.load @__auto.blk.19.attn_q.weight : tensor<4096x4096xf16>
%174 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.19.attn_k.weight = util.global.load @__auto.blk.19.attn_k.weight : tensor<1024x4096xf16>
%175 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.19.attn_v.weight = util.global.load @__auto.blk.19.attn_v.weight : tensor<1024x4096xf16>
%176 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.19.attn_output.weight = util.global.load @__auto.blk.19.attn_output.weight : tensor<4096x4096xf16>
%177 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xf32>
%178 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.19.ffn_gate.weight = util.global.load @__auto.blk.19.ffn_gate.weight : tensor<14336x4096xf16>
%179 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.19.ffn_up.weight = util.global.load @__auto.blk.19.ffn_up.weight : tensor<14336x4096xf16>
%180 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.19.ffn_down.weight = util.global.load @__auto.blk.19.ffn_down.weight : tensor<4096x14336xf16>
%181 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xf32>
%182 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.20.attn_q.weight = util.global.load @__auto.blk.20.attn_q.weight : tensor<4096x4096xf16>
%183 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.20.attn_k.weight = util.global.load @__auto.blk.20.attn_k.weight : tensor<1024x4096xf16>
%184 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.20.attn_v.weight = util.global.load @__auto.blk.20.attn_v.weight : tensor<1024x4096xf16>
%185 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.20.attn_output.weight = util.global.load @__auto.blk.20.attn_output.weight : tensor<4096x4096xf16>
%186 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xf32>
%187 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.20.ffn_gate.weight = util.global.load @__auto.blk.20.ffn_gate.weight : tensor<14336x4096xf16>
%188 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.20.ffn_up.weight = util.global.load @__auto.blk.20.ffn_up.weight : tensor<14336x4096xf16>
%189 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.20.ffn_down.weight = util.global.load @__auto.blk.20.ffn_down.weight : tensor<4096x14336xf16>
%190 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xf32>
%191 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.21.attn_q.weight = util.global.load @__auto.blk.21.attn_q.weight : tensor<4096x4096xf16>
%192 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.21.attn_k.weight = util.global.load @__auto.blk.21.attn_k.weight : tensor<1024x4096xf16>
%193 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.21.attn_v.weight = util.global.load @__auto.blk.21.attn_v.weight : tensor<1024x4096xf16>
%194 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.21.attn_output.weight = util.global.load @__auto.blk.21.attn_output.weight : tensor<4096x4096xf16>
%195 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xf32>
%196 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.21.ffn_gate.weight = util.global.load @__auto.blk.21.ffn_gate.weight : tensor<14336x4096xf16>
%197 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.21.ffn_up.weight = util.global.load @__auto.blk.21.ffn_up.weight : tensor<14336x4096xf16>
%198 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.21.ffn_down.weight = util.global.load @__auto.blk.21.ffn_down.weight : tensor<4096x14336xf16>
%199 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xf32>
%200 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.22.attn_q.weight = util.global.load @__auto.blk.22.attn_q.weight : tensor<4096x4096xf16>
%201 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.22.attn_k.weight = util.global.load @__auto.blk.22.attn_k.weight : tensor<1024x4096xf16>
%202 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.22.attn_v.weight = util.global.load @__auto.blk.22.attn_v.weight : tensor<1024x4096xf16>
%203 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.22.attn_output.weight = util.global.load @__auto.blk.22.attn_output.weight : tensor<4096x4096xf16>
%204 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xf32>
%205 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.22.ffn_gate.weight = util.global.load @__auto.blk.22.ffn_gate.weight : tensor<14336x4096xf16>
%206 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.22.ffn_up.weight = util.global.load @__auto.blk.22.ffn_up.weight : tensor<14336x4096xf16>
%207 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.22.ffn_down.weight = util.global.load @__auto.blk.22.ffn_down.weight : tensor<4096x14336xf16>
%208 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xf32>
%209 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.23.attn_q.weight = util.global.load @__auto.blk.23.attn_q.weight : tensor<4096x4096xf16>
%210 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.23.attn_k.weight = util.global.load @__auto.blk.23.attn_k.weight : tensor<1024x4096xf16>
%211 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.23.attn_v.weight = util.global.load @__auto.blk.23.attn_v.weight : tensor<1024x4096xf16>
%212 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.23.attn_output.weight = util.global.load @__auto.blk.23.attn_output.weight : tensor<4096x4096xf16>
%213 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xf32>
%214 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.23.ffn_gate.weight = util.global.load @__auto.blk.23.ffn_gate.weight : tensor<14336x4096xf16>
%215 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.23.ffn_up.weight = util.global.load @__auto.blk.23.ffn_up.weight : tensor<14336x4096xf16>
%216 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.23.ffn_down.weight = util.global.load @__auto.blk.23.ffn_down.weight : tensor<4096x14336xf16>
%217 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xf32>
%218 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.24.attn_q.weight = util.global.load @__auto.blk.24.attn_q.weight : tensor<4096x4096xf16>
%219 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.24.attn_k.weight = util.global.load @__auto.blk.24.attn_k.weight : tensor<1024x4096xf16>
%220 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.24.attn_v.weight = util.global.load @__auto.blk.24.attn_v.weight : tensor<1024x4096xf16>
%221 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.24.attn_output.weight = util.global.load @__auto.blk.24.attn_output.weight : tensor<4096x4096xf16>
%222 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xf32>
%223 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.24.ffn_gate.weight = util.global.load @__auto.blk.24.ffn_gate.weight : tensor<14336x4096xf16>
%224 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.24.ffn_up.weight = util.global.load @__auto.blk.24.ffn_up.weight : tensor<14336x4096xf16>
%225 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.24.ffn_down.weight = util.global.load @__auto.blk.24.ffn_down.weight : tensor<4096x14336xf16>
%226 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xf32>
%227 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.25.attn_q.weight = util.global.load @__auto.blk.25.attn_q.weight : tensor<4096x4096xf16>
%228 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.25.attn_k.weight = util.global.load @__auto.blk.25.attn_k.weight : tensor<1024x4096xf16>
%229 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.25.attn_v.weight = util.global.load @__auto.blk.25.attn_v.weight : tensor<1024x4096xf16>
%230 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.25.attn_output.weight = util.global.load @__auto.blk.25.attn_output.weight : tensor<4096x4096xf16>
%231 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xf32>
%232 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.25.ffn_gate.weight = util.global.load @__auto.blk.25.ffn_gate.weight : tensor<14336x4096xf16>
%233 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.25.ffn_up.weight = util.global.load @__auto.blk.25.ffn_up.weight : tensor<14336x4096xf16>
%234 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.25.ffn_down.weight = util.global.load @__auto.blk.25.ffn_down.weight : tensor<4096x14336xf16>
%235 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xf32>
%236 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.26.attn_q.weight = util.global.load @__auto.blk.26.attn_q.weight : tensor<4096x4096xf16>
%237 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.26.attn_k.weight = util.global.load @__auto.blk.26.attn_k.weight : tensor<1024x4096xf16>
%238 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.26.attn_v.weight = util.global.load @__auto.blk.26.attn_v.weight : tensor<1024x4096xf16>
%239 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.26.attn_output.weight = util.global.load @__auto.blk.26.attn_output.weight : tensor<4096x4096xf16>
%240 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xf32>
%241 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.26.ffn_gate.weight = util.global.load @__auto.blk.26.ffn_gate.weight : tensor<14336x4096xf16>
%242 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.26.ffn_up.weight = util.global.load @__auto.blk.26.ffn_up.weight : tensor<14336x4096xf16>
%243 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.26.ffn_down.weight = util.global.load @__auto.blk.26.ffn_down.weight : tensor<4096x14336xf16>
%244 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xf32>
%245 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.27.attn_q.weight = util.global.load @__auto.blk.27.attn_q.weight : tensor<4096x4096xf16>
%246 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.27.attn_k.weight = util.global.load @__auto.blk.27.attn_k.weight : tensor<1024x4096xf16>
%247 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.27.attn_v.weight = util.global.load @__auto.blk.27.attn_v.weight : tensor<1024x4096xf16>
%248 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.27.attn_output.weight = util.global.load @__auto.blk.27.attn_output.weight : tensor<4096x4096xf16>
%249 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xf32>
%250 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.27.ffn_gate.weight = util.global.load @__auto.blk.27.ffn_gate.weight : tensor<14336x4096xf16>
%251 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.27.ffn_up.weight = util.global.load @__auto.blk.27.ffn_up.weight : tensor<14336x4096xf16>
%252 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.27.ffn_down.weight = util.global.load @__auto.blk.27.ffn_down.weight : tensor<4096x14336xf16>
%253 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xf32>
%254 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.28.attn_q.weight = util.global.load @__auto.blk.28.attn_q.weight : tensor<4096x4096xf16>
%255 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.28.attn_k.weight = util.global.load @__auto.blk.28.attn_k.weight : tensor<1024x4096xf16>
%256 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.28.attn_v.weight = util.global.load @__auto.blk.28.attn_v.weight : tensor<1024x4096xf16>
%257 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.28.attn_output.weight = util.global.load @__auto.blk.28.attn_output.weight : tensor<4096x4096xf16>
%258 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xf32>
%259 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.28.ffn_gate.weight = util.global.load @__auto.blk.28.ffn_gate.weight : tensor<14336x4096xf16>
%260 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.28.ffn_up.weight = util.global.load @__auto.blk.28.ffn_up.weight : tensor<14336x4096xf16>
%261 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.28.ffn_down.weight = util.global.load @__auto.blk.28.ffn_down.weight : tensor<4096x14336xf16>
%262 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xf32>
%263 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.29.attn_q.weight = util.global.load @__auto.blk.29.attn_q.weight : tensor<4096x4096xf16>
%264 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.29.attn_k.weight = util.global.load @__auto.blk.29.attn_k.weight : tensor<1024x4096xf16>
%265 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.29.attn_v.weight = util.global.load @__auto.blk.29.attn_v.weight : tensor<1024x4096xf16>
%266 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.29.attn_output.weight = util.global.load @__auto.blk.29.attn_output.weight : tensor<4096x4096xf16>
%267 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xf32>
%268 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.29.ffn_gate.weight = util.global.load @__auto.blk.29.ffn_gate.weight : tensor<14336x4096xf16>
%269 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.29.ffn_up.weight = util.global.load @__auto.blk.29.ffn_up.weight : tensor<14336x4096xf16>
%270 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.29.ffn_down.weight = util.global.load @__auto.blk.29.ffn_down.weight : tensor<4096x14336xf16>
%271 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xf32>
%272 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.30.attn_q.weight = util.global.load @__auto.blk.30.attn_q.weight : tensor<4096x4096xf16>
%273 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.30.attn_k.weight = util.global.load @__auto.blk.30.attn_k.weight : tensor<1024x4096xf16>
%274 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.30.attn_v.weight = util.global.load @__auto.blk.30.attn_v.weight : tensor<1024x4096xf16>
%275 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.30.attn_output.weight = util.global.load @__auto.blk.30.attn_output.weight : tensor<4096x4096xf16>
%276 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xf32>
%277 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.30.ffn_gate.weight = util.global.load @__auto.blk.30.ffn_gate.weight : tensor<14336x4096xf16>
%278 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.30.ffn_up.weight = util.global.load @__auto.blk.30.ffn_up.weight : tensor<14336x4096xf16>
%279 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.30.ffn_down.weight = util.global.load @__auto.blk.30.ffn_down.weight : tensor<4096x14336xf16>
%280 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xf32>
%281 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.31.attn_q.weight = util.global.load @__auto.blk.31.attn_q.weight : tensor<4096x4096xf16>
%282 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.31.attn_k.weight = util.global.load @__auto.blk.31.attn_k.weight : tensor<1024x4096xf16>
%283 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.31.attn_v.weight = util.global.load @__auto.blk.31.attn_v.weight : tensor<1024x4096xf16>
%284 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight : tensor<1024x4096xf16> -> !torch.vtensor<[1024,4096],f16>
%__auto.blk.31.attn_output.weight = util.global.load @__auto.blk.31.attn_output.weight : tensor<4096x4096xf16>
%285 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight : tensor<4096x4096xf16> -> !torch.vtensor<[4096,4096],f16>
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xf32>
%286 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.blk.31.ffn_gate.weight = util.global.load @__auto.blk.31.ffn_gate.weight : tensor<14336x4096xf16>
%287 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.31.ffn_up.weight = util.global.load @__auto.blk.31.ffn_up.weight : tensor<14336x4096xf16>
%288 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight : tensor<14336x4096xf16> -> !torch.vtensor<[14336,4096],f16>
%__auto.blk.31.ffn_down.weight = util.global.load @__auto.blk.31.ffn_down.weight : tensor<4096x14336xf16>
%289 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight : tensor<4096x14336xf16> -> !torch.vtensor<[4096,14336],f16>
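// End of the per-block parameters. The remaining globals are the model-level
// output normalization weight and the final 128256x4096 vocabulary projection.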
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xf32>
%290 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xf32> -> !torch.vtensor<[4096],f32>
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xf16>
%291 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xf16> -> !torch.vtensor<[128256,4096],f16>
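// Parameter loading ends here and the forward computation begins. Reading from
// the ops below: %arg0 is a [4, ?] batch of token ids, %arg1 appears to hold one
// length/offset per sequence (it is compared against position indices to build a
// mask), and %arg3 is likely a flattened cache buffer, copied into a vtensor.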
%292 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,1048576],f32>
%int1 = torch.constant.int 1
%293 = torch.aten.size.int %arg0, %int1 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int
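// Enumerate token positions 0 .. %293-1 as an si64 tensor on the CPU.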
%int0 = torch.constant.int 0
%int1_0 = torch.constant.int 1
%none = torch.constant.none
%none_1 = torch.constant.none
%cpu = torch.constant.device "cpu"
%false = torch.constant.bool false
%294 = torch.aten.arange.start_step %int0, %293, %int1_0, %none, %none_1, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[?],si64>
%int-1 = torch.constant.int -1
%295 = torch.aten.unsqueeze %arg1, %int-1 : !torch.vtensor<[4],si64>, !torch.int -> !torch.vtensor<[4,1],si64>
%296 = torch.aten.ge.Tensor %294, %295 : !torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64> -> !torch.vtensor<[4,?],i1>
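// %296 flags positions at or past each sequence's %arg1 value (a padding-style
// boolean mask). Next, a precomputed 1x1x8192x8192 boolean causal mask is loaded
// and sliced down to the current sequence length.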
%__auto.constant_1_1_8192_8192_torch.bool = util.global.load @__auto.constant_1_1_8192_8192_torch.bool : tensor<1x1x8192x8192xi1>
%297 = torch_c.from_builtin_tensor %__auto.constant_1_1_8192_8192_torch.bool : tensor<1x1x8192x8192xi1> -> !torch.vtensor<[1,1,8192,8192],i1>
%int0_2 = torch.constant.int 0
%int0_3 = torch.constant.int 0
%int9223372036854775807 = torch.constant.int 9223372036854775807
%int1_4 = torch.constant.int 1
%298 = torch.aten.slice.Tensor %297, %int0_2, %int0_3, %int9223372036854775807, %int1_4 : !torch.vtensor<[1,1,8192,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,8192,8192],i1>
%int1_5 = torch.constant.int 1
%int0_6 = torch.constant.int 0
%int9223372036854775807_7 = torch.constant.int 9223372036854775807
%int1_8 = torch.constant.int 1
%299 = torch.aten.slice.Tensor %298, %int1_5, %int0_6, %int9223372036854775807_7, %int1_8 : !torch.vtensor<[1,1,8192,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,8192,8192],i1>
%int0_9 = torch.constant.int 0
%300 = torch.aten.size.int %294, %int0_9 : !torch.vtensor<[?],si64>, !torch.int -> !torch.int
%int2 = torch.constant.int 2
%int0_10 = torch.constant.int 0
%int1_11 = torch.constant.int 1
%301 = torch.aten.slice.Tensor %299, %int2, %int0_10, %300, %int1_11 : !torch.vtensor<[1,1,8192,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,8192],i1>
%int3 = torch.constant.int 3
%int0_12 = torch.constant.int 0
%int1_13 = torch.constant.int 1
%302 = torch.aten.slice.Tensor %301, %int3, %int0_12, %300, %int1_13 : !torch.vtensor<[1,1,?,8192],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,?],i1>
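// %302 is now the [1,1,seq,seq] causal window; the per-sequence mask from %296
// is broadcast to [4,1,1,seq] below so the two can be combined.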
%int0_14 = torch.constant.int 0
%int0_15 = torch.constant.int 0
%int9223372036854775807_16 = torch.constant.int 9223372036854775807
%int1_17 = torch.constant.int 1
%303 = torch.aten.slice.Tensor %296, %int0_14, %int0_15, %int9223372036854775807_16, %int1_17 : !torch.vtensor<[4,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?],i1>
%int1_18 = torch.constant.int 1
%304 = torch.aten.unsqueeze %303, %int1_18 : !torch.vtensor<[4,?],i1>, !torch.int -> !torch.vtensor<[4,1,?],i1>
%int2_19 = torch.constant.int 2
%305 = torch.aten.unsqueeze %304, %int2_19 : !torch.vtensor<[4,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,1,?],i1>
%int3_20 = torch.constant.int 3
%int0_21 = torch.constant.int 0
%int9223372036854775807_22 = torch.constant.int 9223372036854775807
%int1_23 = torch.constant.int 1
%306 = torch.aten.slice.Tensor %305, %int3_20, %int0_21, %int9223372036854775807_22, %int1_23 : !torch.vtensor<[4,1,1,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,1,1,?],i1>
%int1_24 = torch.constant.int 1
%307 = torch.aten.add.Tensor %302, %306, %int1_24 : !torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,?,?],i1>
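// %307 merges the causal and per-sequence masks (an elementwise add.Tensor on i1
// operands, which is effectively a logical OR). It is then turned into an
// additive attention mask: 0.0 where attention is allowed, -Inf where masked.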
%int6 = torch.constant.int 6
%none_25 = torch.constant.none
%none_26 = torch.constant.none
%false_27 = torch.constant.bool false
%none_28 = torch.constant.none
%308 = torch.aten.zeros_like %307, %int6, %none_25, %none_26, %false_27, %none_28 : !torch.vtensor<[4,1,?,?],i1>, !torch.int, !torch.none, !torch.none, !torch.bool, !torch.none -> !torch.vtensor<[4,1,?,?],f32>
%float-Inf = torch.constant.float 0xFFF0000000000000
%int6_29 = torch.constant.int 6
%int0_30 = torch.constant.int 0
%cpu_31 = torch.constant.device "cpu"
%none_32 = torch.constant.none
%309 = torch.aten.scalar_tensor %float-Inf, %int6_29, %int0_30, %cpu_31, %none_32 : !torch.float, !torch.int, !torch.int, !torch.Device, !torch.none -> !torch.vtensor<[],f32>
%310 = torch.aten.where.self %307, %309, %308 : !torch.vtensor<[4,1,?,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[4,1,?,?],f32> -> !torch.vtensor<[4,1,?,?],f32>
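// Token embedding: the f16 embedding table (%0, token_embd.weight) is upcast to
// f32 and gathered with the token ids in %arg0.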
%int6_33 = torch.constant.int 6
%311 = torch.prims.convert_element_type %0, %int6_33 : !torch.vtensor<[128256,4096],f16>, !torch.int -> !torch.vtensor<[128256,4096],f32>
%int-1_34 = torch.constant.int -1
%false_35 = torch.constant.bool false
%false_36 = torch.constant.bool false
%312 = torch.aten.embedding %311, %arg0, %int-1_34, %false_35, %false_36 : !torch.vtensor<[128256,4096],f32>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[4,?,4096],f32>
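// Note that only the embedded activations are returned: the attention and FFN
// weights loaded above are not consumed on this return path, so this listing is
// likely a truncated or minimal export rather than the full forward pass.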
return %312 : !torch.vtensor<[4,?,4096],f32>
}
}