Note: this file has been truncated.
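// Indexing maps shared by the linalg.generic ops in the (truncated) body of this module.
// The reading below is inferred from the maps themselves, not stated in the dump: d0-d4 are
// the common iteration dimensions, #map3 broadcasts a rank-0 (scalar) operand such as a
// quantization scale, and #map5 indexes the (d0, d1, d2) result.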
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d2)>
#map3 = affine_map<(d0, d1, d2, d3, d4) -> ()>
#map4 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d4)>
#map5 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>
module @module {
util.global private @__auto.token_embd.weight = #stream.parameter.named<"model"::"token_embd.weight"> : tensor<128256x4096xbf16>
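// Per-block parameter globals follow. The naming convention appears to be (inferred from the
// suffixes, not documented in this dump): ":qs" tensors hold the f8E4M3FNUZ-quantized weight
// values, while ":rscale" and ":d" are per-tensor f32 scale factors applied around the
// quantized matmuls; the *_norm weights remain in bf16.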
util.global private @__auto.blk.0.attn_norm.weight = #stream.parameter.named<"model"::"blk.0.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.0.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.0.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.0.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.0.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.0.ffn_norm.weight = #stream.parameter.named<"model"::"blk.0.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.0.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.0.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.0.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.0.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.0.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.0.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.0.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.1.attn_norm.weight = #stream.parameter.named<"model"::"blk.1.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.1.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.1.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.1.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.1.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.1.ffn_norm.weight = #stream.parameter.named<"model"::"blk.1.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.1.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.1.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.1.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.1.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.1.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.1.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.1.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.2.attn_norm.weight = #stream.parameter.named<"model"::"blk.2.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.2.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.2.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.2.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.2.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.2.ffn_norm.weight = #stream.parameter.named<"model"::"blk.2.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.2.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.2.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.2.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.2.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.2.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.2.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.2.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.3.attn_norm.weight = #stream.parameter.named<"model"::"blk.3.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.3.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.3.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.3.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.3.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.3.ffn_norm.weight = #stream.parameter.named<"model"::"blk.3.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.3.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.3.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.3.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.3.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.3.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.3.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.3.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.4.attn_norm.weight = #stream.parameter.named<"model"::"blk.4.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.4.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.4.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.4.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.4.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.4.ffn_norm.weight = #stream.parameter.named<"model"::"blk.4.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.4.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.4.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.4.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.4.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.4.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.4.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.4.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.5.attn_norm.weight = #stream.parameter.named<"model"::"blk.5.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.5.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.5.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.5.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.5.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.5.ffn_norm.weight = #stream.parameter.named<"model"::"blk.5.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.5.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.5.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.5.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.5.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.5.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.5.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.5.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.6.attn_norm.weight = #stream.parameter.named<"model"::"blk.6.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.6.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.6.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.6.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.6.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.6.ffn_norm.weight = #stream.parameter.named<"model"::"blk.6.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.6.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.6.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.6.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.6.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.6.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.6.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.6.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.7.attn_norm.weight = #stream.parameter.named<"model"::"blk.7.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.7.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.7.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.7.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.7.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.7.ffn_norm.weight = #stream.parameter.named<"model"::"blk.7.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.7.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.7.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.7.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.7.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.7.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.7.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.7.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.8.attn_norm.weight = #stream.parameter.named<"model"::"blk.8.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.8.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.8.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.8.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.8.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.8.ffn_norm.weight = #stream.parameter.named<"model"::"blk.8.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.8.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.8.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.8.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.8.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.8.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.8.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.8.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.9.attn_norm.weight = #stream.parameter.named<"model"::"blk.9.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.9.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.9.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.9.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.9.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.9.ffn_norm.weight = #stream.parameter.named<"model"::"blk.9.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.9.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.9.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.9.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.9.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.9.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.9.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.9.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.10.attn_norm.weight = #stream.parameter.named<"model"::"blk.10.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.10.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.10.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.10.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.10.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.10.ffn_norm.weight = #stream.parameter.named<"model"::"blk.10.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.10.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.10.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.10.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.10.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.10.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.10.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.10.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.11.attn_norm.weight = #stream.parameter.named<"model"::"blk.11.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.11.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.11.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.11.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.11.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.11.ffn_norm.weight = #stream.parameter.named<"model"::"blk.11.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.11.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.11.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.11.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.11.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.11.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.11.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.11.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.12.attn_norm.weight = #stream.parameter.named<"model"::"blk.12.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.12.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.12.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.12.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.12.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.12.ffn_norm.weight = #stream.parameter.named<"model"::"blk.12.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.12.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.12.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.12.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.12.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.12.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.12.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.12.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.13.attn_norm.weight = #stream.parameter.named<"model"::"blk.13.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.13.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.13.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.13.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.13.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.13.ffn_norm.weight = #stream.parameter.named<"model"::"blk.13.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.13.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.13.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.13.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.13.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.13.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.13.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.13.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.14.attn_norm.weight = #stream.parameter.named<"model"::"blk.14.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.14.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.14.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.14.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.14.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.14.ffn_norm.weight = #stream.parameter.named<"model"::"blk.14.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.14.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.14.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.14.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.14.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.14.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.14.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.14.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.15.attn_norm.weight = #stream.parameter.named<"model"::"blk.15.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.15.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.15.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.15.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.15.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.15.ffn_norm.weight = #stream.parameter.named<"model"::"blk.15.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.15.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.15.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.15.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.15.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.15.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.15.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.15.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.16.attn_norm.weight = #stream.parameter.named<"model"::"blk.16.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.16.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.16.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.16.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.16.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.16.ffn_norm.weight = #stream.parameter.named<"model"::"blk.16.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.16.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.16.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.16.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.16.ffn_down.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.16.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ>
util.global private @"__auto.blk.16.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.16.ffn_down.weight:d"> : tensor<f32>
util.global private @__auto.blk.17.attn_norm.weight = #stream.parameter.named<"model"::"blk.17.attn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.17.attn_q.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.17.attn_k.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.17.attn_v.q_output:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.attn_output.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.17.attn_output.weight:d"> : tensor<f32>
util.global private @__auto.blk.17.ffn_norm.weight = #stream.parameter.named<"model"::"blk.17.ffn_norm.weight"> : tensor<4096xbf16>
util.global private @"__auto.blk.17.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_gate.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.17.ffn_gate.weight:d"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_up.q_input:rscale"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ>
util.global private @"__auto.blk.17.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.17.ffn_up.weight:d"> : tensor<f32>
util.global private @"__auto.blk.17.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.17.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.17.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.17.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.17.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.18.attn_norm.weight = #stream.parameter.named<"model"::"blk.18.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.18.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.18.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.18.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.18.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.18.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.18.ffn_norm.weight = #stream.parameter.named<"model"::"blk.18.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.18.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.18.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.18.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.18.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.18.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.18.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.18.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.19.attn_norm.weight = #stream.parameter.named<"model"::"blk.19.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.19.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.19.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.19.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.19.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.19.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.19.ffn_norm.weight = #stream.parameter.named<"model"::"blk.19.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.19.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.19.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.19.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.19.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.19.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.19.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.19.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.20.attn_norm.weight = #stream.parameter.named<"model"::"blk.20.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.20.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.20.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.20.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.20.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.20.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.20.ffn_norm.weight = #stream.parameter.named<"model"::"blk.20.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.20.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.20.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.20.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.20.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.20.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.20.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.20.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.21.attn_norm.weight = #stream.parameter.named<"model"::"blk.21.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.21.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.21.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.21.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.21.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.21.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.21.ffn_norm.weight = #stream.parameter.named<"model"::"blk.21.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.21.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.21.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.21.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.21.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.21.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.21.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.21.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.22.attn_norm.weight = #stream.parameter.named<"model"::"blk.22.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.22.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.22.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.22.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.22.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.22.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.22.ffn_norm.weight = #stream.parameter.named<"model"::"blk.22.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.22.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.22.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.22.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.22.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.22.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.22.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.22.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.23.attn_norm.weight = #stream.parameter.named<"model"::"blk.23.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.23.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.23.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.23.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.23.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.23.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.23.ffn_norm.weight = #stream.parameter.named<"model"::"blk.23.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.23.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.23.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.23.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.23.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.23.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.23.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.23.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.24.attn_norm.weight = #stream.parameter.named<"model"::"blk.24.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.24.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.24.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.24.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.24.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.24.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.24.ffn_norm.weight = #stream.parameter.named<"model"::"blk.24.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.24.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.24.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.24.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.24.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.24.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.24.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.24.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.25.attn_norm.weight = #stream.parameter.named<"model"::"blk.25.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.25.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.25.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.25.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.25.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.25.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.25.ffn_norm.weight = #stream.parameter.named<"model"::"blk.25.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.25.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.25.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.25.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.25.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.25.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.25.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.25.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.26.attn_norm.weight = #stream.parameter.named<"model"::"blk.26.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.26.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.26.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.26.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.26.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.26.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.26.ffn_norm.weight = #stream.parameter.named<"model"::"blk.26.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.26.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.26.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.26.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.26.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.26.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.26.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.26.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.27.attn_norm.weight = #stream.parameter.named<"model"::"blk.27.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.27.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.27.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.27.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.27.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.27.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.27.ffn_norm.weight = #stream.parameter.named<"model"::"blk.27.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.27.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.27.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.27.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.27.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.27.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.27.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.27.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.28.attn_norm.weight = #stream.parameter.named<"model"::"blk.28.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.28.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.28.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.28.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.28.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.28.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.28.ffn_norm.weight = #stream.parameter.named<"model"::"blk.28.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.28.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.28.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.28.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.28.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.28.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.28.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.28.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.29.attn_norm.weight = #stream.parameter.named<"model"::"blk.29.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.29.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.29.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.29.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.29.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.29.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.29.ffn_norm.weight = #stream.parameter.named<"model"::"blk.29.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.29.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.29.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.29.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.29.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.29.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.29.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.29.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.30.attn_norm.weight = #stream.parameter.named<"model"::"blk.30.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.30.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.30.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.30.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.30.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.30.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.30.ffn_norm.weight = #stream.parameter.named<"model"::"blk.30.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.30.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.30.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.30.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.30.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.30.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.30.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.30.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.31.attn_norm.weight = #stream.parameter.named<"model"::"blk.31.attn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.31.attn_q.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_q.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_q.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_q.q_output:rscale" = #stream.parameter.named<"model"::"blk.31.attn_q.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_k.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_k.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_k.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_k.q_output:rscale" = #stream.parameter.named<"model"::"blk.31.attn_k.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_v.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_v.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_v.weight:qs"> : tensor<1024x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_v.q_output:rscale" = #stream.parameter.named<"model"::"blk.31.attn_v.q_output:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_output.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.attn_output.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.attn_output.weight:qs" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:qs"> : tensor<4096x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.attn_output.weight:d" = #stream.parameter.named<"model"::"blk.31.attn_output.weight:d"> : tensor<f32> | |
util.global private @__auto.blk.31.ffn_norm.weight = #stream.parameter.named<"model"::"blk.31.ffn_norm.weight"> : tensor<4096xbf16> | |
util.global private @"__auto.blk.31.ffn_gate.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_gate.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_gate.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_gate.weight:d" = #stream.parameter.named<"model"::"blk.31.ffn_gate.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_up.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_up.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_up.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:qs"> : tensor<14336x4096xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_up.weight:d" = #stream.parameter.named<"model"::"blk.31.ffn_up.weight:d"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_down.q_input:rscale" = #stream.parameter.named<"model"::"blk.31.ffn_down.q_input:rscale"> : tensor<f32> | |
util.global private @"__auto.blk.31.ffn_down.weight:qs" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:qs"> : tensor<4096x14336xf8E4M3FNUZ> | |
util.global private @"__auto.blk.31.ffn_down.weight:d" = #stream.parameter.named<"model"::"blk.31.ffn_down.weight:d"> : tensor<f32> | |
util.global private @__auto.output_norm.weight = #stream.parameter.named<"model"::"output_norm.weight"> : tensor<4096xbf16> | |
util.global private @__auto.output.weight = #stream.parameter.named<"model"::"output.weight"> : tensor<128256x4096xbf16> | |
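// Prefill entrypoint for batch size 4 with a dynamic sequence length. Going by the signature, %arg0 is
// presumably the token ids, %arg1 the per-sequence lengths, %arg2 a page/block table for the paged cache,
// and %arg3 the f8E4M3FNUZ KV-cache buffer; the [4,?,128256] f32 result holds per-token vocabulary logits.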
func.func @prefill_bs4(%arg0: !torch.vtensor<[4,?],si64>, %arg1: !torch.vtensor<[4],si64>, %arg2: !torch.vtensor<[4,?],si64>, %arg3: !torch.tensor<[?,2097152],f8E4M3FNUZ>) -> !torch.vtensor<[4,?,128256],f32> attributes {torch.assume_strict_symbolic_shapes} { | |
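    // The body first materializes every parameter: each util.global.load yields a builtin tensor that
    // torch_c.from_builtin_tensor wraps into a !torch.vtensor for the torch-dialect ops that follow.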
%__auto.token_embd.weight = util.global.load @__auto.token_embd.weight : tensor<128256x4096xbf16> | |
%0 = torch_c.from_builtin_tensor %__auto.token_embd.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16> | |
%__auto.blk.0.attn_norm.weight = util.global.load @__auto.blk.0.attn_norm.weight : tensor<4096xbf16> | |
%1 = torch_c.from_builtin_tensor %__auto.blk.0.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.0.attn_q.q_input3Arscale = util.global.load @"__auto.blk.0.attn_q.q_input:rscale" : tensor<f32> | |
%2 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_q.weight3Aqs = util.global.load @"__auto.blk.0.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%3 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_q.q_output3Arscale = util.global.load @"__auto.blk.0.attn_q.q_output:rscale" : tensor<f32> | |
%4 = torch_c.from_builtin_tensor %__auto.blk.0.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_k.q_input3Arscale = util.global.load @"__auto.blk.0.attn_k.q_input:rscale" : tensor<f32> | |
%5 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_k.weight3Aqs = util.global.load @"__auto.blk.0.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%6 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_k.q_output3Arscale = util.global.load @"__auto.blk.0.attn_k.q_output:rscale" : tensor<f32> | |
%7 = torch_c.from_builtin_tensor %__auto.blk.0.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_v.q_input3Arscale = util.global.load @"__auto.blk.0.attn_v.q_input:rscale" : tensor<f32> | |
%8 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_v.weight3Aqs = util.global.load @"__auto.blk.0.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%9 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_v.q_output3Arscale = util.global.load @"__auto.blk.0.attn_v.q_output:rscale" : tensor<f32> | |
%10 = torch_c.from_builtin_tensor %__auto.blk.0.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
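    // The scalar literal below, 0.0883883461, equals 1/sqrt(128), presumably the softmax scaling factor
    // for 128-wide attention heads; the same constant is re-materialized per block (e.g. %35 further down).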
%11 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_output.q_input3Arscale = util.global.load @"__auto.blk.0.attn_output.q_input:rscale" : tensor<f32> | |
%12 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.attn_output.weight3Aqs = util.global.load @"__auto.blk.0.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%13 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.0.attn_output.weight3Ad = util.global.load @"__auto.blk.0.attn_output.weight:d" : tensor<f32> | |
%14 = torch_c.from_builtin_tensor %__auto.blk.0.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_norm.weight = util.global.load @__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> | |
%15 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.0.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_gate.q_input:rscale" : tensor<f32> | |
%16 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.0.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%17 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_gate.weight3Ad = util.global.load @"__auto.blk.0.ffn_gate.weight:d" : tensor<f32> | |
%18 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_up.q_input:rscale" : tensor<f32> | |
%19 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_up.weight3Aqs = util.global.load @"__auto.blk.0.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%20 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_up.weight3Ad = util.global.load @"__auto.blk.0.ffn_up.weight:d" : tensor<f32> | |
%21 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.0.ffn_down.q_input:rscale" : tensor<f32> | |
%22 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.0.ffn_down.weight3Aqs = util.global.load @"__auto.blk.0.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%23 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.0.ffn_down.weight3Ad = util.global.load @"__auto.blk.0.ffn_down.weight:d" : tensor<f32> | |
%24 = torch_c.from_builtin_tensor %__auto.blk.0.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
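// blk.1 parameter loads: the same pattern repeats for every transformer block below.
// Each block loads attn_norm (bf16), the q/k/v/output projection weights quantized to f8E4M3FNUZ
// with f32 input/output rescale factors, a dense<0.0883883461> f32 literal (1/sqrt(128), presumably
// the attention scale), ffn_norm (bf16), and the gate/up/down FFN weights with their f32 scales.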
%__auto.blk.1.attn_norm.weight = util.global.load @__auto.blk.1.attn_norm.weight : tensor<4096xbf16> | |
%25 = torch_c.from_builtin_tensor %__auto.blk.1.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.1.attn_q.q_input3Arscale = util.global.load @"__auto.blk.1.attn_q.q_input:rscale" : tensor<f32> | |
%26 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_q.weight3Aqs = util.global.load @"__auto.blk.1.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%27 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_q.q_output3Arscale = util.global.load @"__auto.blk.1.attn_q.q_output:rscale" : tensor<f32> | |
%28 = torch_c.from_builtin_tensor %__auto.blk.1.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_k.q_input3Arscale = util.global.load @"__auto.blk.1.attn_k.q_input:rscale" : tensor<f32> | |
%29 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_k.weight3Aqs = util.global.load @"__auto.blk.1.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%30 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_k.q_output3Arscale = util.global.load @"__auto.blk.1.attn_k.q_output:rscale" : tensor<f32> | |
%31 = torch_c.from_builtin_tensor %__auto.blk.1.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_v.q_input3Arscale = util.global.load @"__auto.blk.1.attn_v.q_input:rscale" : tensor<f32> | |
%32 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_v.weight3Aqs = util.global.load @"__auto.blk.1.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%33 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_v.q_output3Arscale = util.global.load @"__auto.blk.1.attn_v.q_output:rscale" : tensor<f32> | |
%34 = torch_c.from_builtin_tensor %__auto.blk.1.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%35 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_output.q_input3Arscale = util.global.load @"__auto.blk.1.attn_output.q_input:rscale" : tensor<f32> | |
%36 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.attn_output.weight3Aqs = util.global.load @"__auto.blk.1.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%37 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.1.attn_output.weight3Ad = util.global.load @"__auto.blk.1.attn_output.weight:d" : tensor<f32> | |
%38 = torch_c.from_builtin_tensor %__auto.blk.1.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_norm.weight = util.global.load @__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> | |
%39 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.1.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_gate.q_input:rscale" : tensor<f32> | |
%40 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.1.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%41 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_gate.weight3Ad = util.global.load @"__auto.blk.1.ffn_gate.weight:d" : tensor<f32> | |
%42 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_up.q_input:rscale" : tensor<f32> | |
%43 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_up.weight3Aqs = util.global.load @"__auto.blk.1.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%44 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_up.weight3Ad = util.global.load @"__auto.blk.1.ffn_up.weight:d" : tensor<f32> | |
%45 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.1.ffn_down.q_input:rscale" : tensor<f32> | |
%46 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.1.ffn_down.weight3Aqs = util.global.load @"__auto.blk.1.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%47 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.1.ffn_down.weight3Ad = util.global.load @"__auto.blk.1.ffn_down.weight:d" : tensor<f32> | |
%48 = torch_c.from_builtin_tensor %__auto.blk.1.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
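// blk.2 parameter loads (same pattern as blk.1).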
%__auto.blk.2.attn_norm.weight = util.global.load @__auto.blk.2.attn_norm.weight : tensor<4096xbf16> | |
%49 = torch_c.from_builtin_tensor %__auto.blk.2.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.2.attn_q.q_input3Arscale = util.global.load @"__auto.blk.2.attn_q.q_input:rscale" : tensor<f32> | |
%50 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_q.weight3Aqs = util.global.load @"__auto.blk.2.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%51 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_q.q_output3Arscale = util.global.load @"__auto.blk.2.attn_q.q_output:rscale" : tensor<f32> | |
%52 = torch_c.from_builtin_tensor %__auto.blk.2.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_k.q_input3Arscale = util.global.load @"__auto.blk.2.attn_k.q_input:rscale" : tensor<f32> | |
%53 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_k.weight3Aqs = util.global.load @"__auto.blk.2.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%54 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_k.q_output3Arscale = util.global.load @"__auto.blk.2.attn_k.q_output:rscale" : tensor<f32> | |
%55 = torch_c.from_builtin_tensor %__auto.blk.2.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_v.q_input3Arscale = util.global.load @"__auto.blk.2.attn_v.q_input:rscale" : tensor<f32> | |
%56 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_v.weight3Aqs = util.global.load @"__auto.blk.2.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%57 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_v.q_output3Arscale = util.global.load @"__auto.blk.2.attn_v.q_output:rscale" : tensor<f32> | |
%58 = torch_c.from_builtin_tensor %__auto.blk.2.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%59 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_output.q_input3Arscale = util.global.load @"__auto.blk.2.attn_output.q_input:rscale" : tensor<f32> | |
%60 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.attn_output.weight3Aqs = util.global.load @"__auto.blk.2.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%61 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.2.attn_output.weight3Ad = util.global.load @"__auto.blk.2.attn_output.weight:d" : tensor<f32> | |
%62 = torch_c.from_builtin_tensor %__auto.blk.2.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_norm.weight = util.global.load @__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> | |
%63 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.2.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_gate.q_input:rscale" : tensor<f32> | |
%64 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.2.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%65 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_gate.weight3Ad = util.global.load @"__auto.blk.2.ffn_gate.weight:d" : tensor<f32> | |
%66 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_up.q_input:rscale" : tensor<f32> | |
%67 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_up.weight3Aqs = util.global.load @"__auto.blk.2.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%68 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_up.weight3Ad = util.global.load @"__auto.blk.2.ffn_up.weight:d" : tensor<f32> | |
%69 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.2.ffn_down.q_input:rscale" : tensor<f32> | |
%70 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.2.ffn_down.weight3Aqs = util.global.load @"__auto.blk.2.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%71 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.2.ffn_down.weight3Ad = util.global.load @"__auto.blk.2.ffn_down.weight:d" : tensor<f32> | |
%72 = torch_c.from_builtin_tensor %__auto.blk.2.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
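// blk.3 parameter loads (same pattern as blk.1).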
%__auto.blk.3.attn_norm.weight = util.global.load @__auto.blk.3.attn_norm.weight : tensor<4096xbf16> | |
%73 = torch_c.from_builtin_tensor %__auto.blk.3.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.3.attn_q.q_input3Arscale = util.global.load @"__auto.blk.3.attn_q.q_input:rscale" : tensor<f32> | |
%74 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_q.weight3Aqs = util.global.load @"__auto.blk.3.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%75 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_q.q_output3Arscale = util.global.load @"__auto.blk.3.attn_q.q_output:rscale" : tensor<f32> | |
%76 = torch_c.from_builtin_tensor %__auto.blk.3.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_k.q_input3Arscale = util.global.load @"__auto.blk.3.attn_k.q_input:rscale" : tensor<f32> | |
%77 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_k.weight3Aqs = util.global.load @"__auto.blk.3.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%78 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_k.q_output3Arscale = util.global.load @"__auto.blk.3.attn_k.q_output:rscale" : tensor<f32> | |
%79 = torch_c.from_builtin_tensor %__auto.blk.3.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_v.q_input3Arscale = util.global.load @"__auto.blk.3.attn_v.q_input:rscale" : tensor<f32> | |
%80 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_v.weight3Aqs = util.global.load @"__auto.blk.3.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%81 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_v.q_output3Arscale = util.global.load @"__auto.blk.3.attn_v.q_output:rscale" : tensor<f32> | |
%82 = torch_c.from_builtin_tensor %__auto.blk.3.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%83 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_output.q_input3Arscale = util.global.load @"__auto.blk.3.attn_output.q_input:rscale" : tensor<f32> | |
%84 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.attn_output.weight3Aqs = util.global.load @"__auto.blk.3.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%85 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.3.attn_output.weight3Ad = util.global.load @"__auto.blk.3.attn_output.weight:d" : tensor<f32> | |
%86 = torch_c.from_builtin_tensor %__auto.blk.3.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_norm.weight = util.global.load @__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> | |
%87 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.3.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_gate.q_input:rscale" : tensor<f32> | |
%88 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.3.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%89 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_gate.weight3Ad = util.global.load @"__auto.blk.3.ffn_gate.weight:d" : tensor<f32> | |
%90 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_up.q_input:rscale" : tensor<f32> | |
%91 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_up.weight3Aqs = util.global.load @"__auto.blk.3.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%92 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_up.weight3Ad = util.global.load @"__auto.blk.3.ffn_up.weight:d" : tensor<f32> | |
%93 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.3.ffn_down.q_input:rscale" : tensor<f32> | |
%94 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.3.ffn_down.weight3Aqs = util.global.load @"__auto.blk.3.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%95 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.3.ffn_down.weight3Ad = util.global.load @"__auto.blk.3.ffn_down.weight:d" : tensor<f32> | |
%96 = torch_c.from_builtin_tensor %__auto.blk.3.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
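// blk.4 parameter loads (same pattern as blk.1).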
%__auto.blk.4.attn_norm.weight = util.global.load @__auto.blk.4.attn_norm.weight : tensor<4096xbf16> | |
%97 = torch_c.from_builtin_tensor %__auto.blk.4.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.4.attn_q.q_input3Arscale = util.global.load @"__auto.blk.4.attn_q.q_input:rscale" : tensor<f32> | |
%98 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_q.weight3Aqs = util.global.load @"__auto.blk.4.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%99 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_q.q_output3Arscale = util.global.load @"__auto.blk.4.attn_q.q_output:rscale" : tensor<f32> | |
%100 = torch_c.from_builtin_tensor %__auto.blk.4.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_k.q_input3Arscale = util.global.load @"__auto.blk.4.attn_k.q_input:rscale" : tensor<f32> | |
%101 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_k.weight3Aqs = util.global.load @"__auto.blk.4.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%102 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_k.q_output3Arscale = util.global.load @"__auto.blk.4.attn_k.q_output:rscale" : tensor<f32> | |
%103 = torch_c.from_builtin_tensor %__auto.blk.4.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_v.q_input3Arscale = util.global.load @"__auto.blk.4.attn_v.q_input:rscale" : tensor<f32> | |
%104 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_v.weight3Aqs = util.global.load @"__auto.blk.4.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%105 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_v.q_output3Arscale = util.global.load @"__auto.blk.4.attn_v.q_output:rscale" : tensor<f32> | |
%106 = torch_c.from_builtin_tensor %__auto.blk.4.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%107 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_output.q_input3Arscale = util.global.load @"__auto.blk.4.attn_output.q_input:rscale" : tensor<f32> | |
%108 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.attn_output.weight3Aqs = util.global.load @"__auto.blk.4.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%109 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.4.attn_output.weight3Ad = util.global.load @"__auto.blk.4.attn_output.weight:d" : tensor<f32> | |
%110 = torch_c.from_builtin_tensor %__auto.blk.4.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_norm.weight = util.global.load @__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> | |
%111 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.4.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_gate.q_input:rscale" : tensor<f32> | |
%112 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.4.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%113 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_gate.weight3Ad = util.global.load @"__auto.blk.4.ffn_gate.weight:d" : tensor<f32> | |
%114 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_up.q_input:rscale" : tensor<f32> | |
%115 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_up.weight3Aqs = util.global.load @"__auto.blk.4.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%116 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_up.weight3Ad = util.global.load @"__auto.blk.4.ffn_up.weight:d" : tensor<f32> | |
%117 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.4.ffn_down.q_input:rscale" : tensor<f32> | |
%118 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.4.ffn_down.weight3Aqs = util.global.load @"__auto.blk.4.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%119 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.4.ffn_down.weight3Ad = util.global.load @"__auto.blk.4.ffn_down.weight:d" : tensor<f32> | |
%120 = torch_c.from_builtin_tensor %__auto.blk.4.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
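// blk.5 parameter loads (same pattern as blk.1).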
%__auto.blk.5.attn_norm.weight = util.global.load @__auto.blk.5.attn_norm.weight : tensor<4096xbf16> | |
%121 = torch_c.from_builtin_tensor %__auto.blk.5.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.5.attn_q.q_input3Arscale = util.global.load @"__auto.blk.5.attn_q.q_input:rscale" : tensor<f32> | |
%122 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_q.weight3Aqs = util.global.load @"__auto.blk.5.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%123 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_q.q_output3Arscale = util.global.load @"__auto.blk.5.attn_q.q_output:rscale" : tensor<f32> | |
%124 = torch_c.from_builtin_tensor %__auto.blk.5.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_k.q_input3Arscale = util.global.load @"__auto.blk.5.attn_k.q_input:rscale" : tensor<f32> | |
%125 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_k.weight3Aqs = util.global.load @"__auto.blk.5.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%126 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_k.q_output3Arscale = util.global.load @"__auto.blk.5.attn_k.q_output:rscale" : tensor<f32> | |
%127 = torch_c.from_builtin_tensor %__auto.blk.5.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_v.q_input3Arscale = util.global.load @"__auto.blk.5.attn_v.q_input:rscale" : tensor<f32> | |
%128 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_v.weight3Aqs = util.global.load @"__auto.blk.5.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%129 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_v.q_output3Arscale = util.global.load @"__auto.blk.5.attn_v.q_output:rscale" : tensor<f32> | |
%130 = torch_c.from_builtin_tensor %__auto.blk.5.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%131 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_output.q_input3Arscale = util.global.load @"__auto.blk.5.attn_output.q_input:rscale" : tensor<f32> | |
%132 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.attn_output.weight3Aqs = util.global.load @"__auto.blk.5.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%133 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.5.attn_output.weight3Ad = util.global.load @"__auto.blk.5.attn_output.weight:d" : tensor<f32> | |
%134 = torch_c.from_builtin_tensor %__auto.blk.5.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_norm.weight = util.global.load @__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> | |
%135 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.5.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_gate.q_input:rscale" : tensor<f32> | |
%136 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.5.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%137 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_gate.weight3Ad = util.global.load @"__auto.blk.5.ffn_gate.weight:d" : tensor<f32> | |
%138 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_up.q_input:rscale" : tensor<f32> | |
%139 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_up.weight3Aqs = util.global.load @"__auto.blk.5.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%140 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_up.weight3Ad = util.global.load @"__auto.blk.5.ffn_up.weight:d" : tensor<f32> | |
%141 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.5.ffn_down.q_input:rscale" : tensor<f32> | |
%142 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.5.ffn_down.weight3Aqs = util.global.load @"__auto.blk.5.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%143 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.5.ffn_down.weight3Ad = util.global.load @"__auto.blk.5.ffn_down.weight:d" : tensor<f32> | |
%144 = torch_c.from_builtin_tensor %__auto.blk.5.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
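// blk.6 parameter loads (same pattern as blk.1).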
%__auto.blk.6.attn_norm.weight = util.global.load @__auto.blk.6.attn_norm.weight : tensor<4096xbf16> | |
%145 = torch_c.from_builtin_tensor %__auto.blk.6.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.6.attn_q.q_input3Arscale = util.global.load @"__auto.blk.6.attn_q.q_input:rscale" : tensor<f32> | |
%146 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_q.weight3Aqs = util.global.load @"__auto.blk.6.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%147 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_q.q_output3Arscale = util.global.load @"__auto.blk.6.attn_q.q_output:rscale" : tensor<f32> | |
%148 = torch_c.from_builtin_tensor %__auto.blk.6.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_k.q_input3Arscale = util.global.load @"__auto.blk.6.attn_k.q_input:rscale" : tensor<f32> | |
%149 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_k.weight3Aqs = util.global.load @"__auto.blk.6.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%150 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_k.q_output3Arscale = util.global.load @"__auto.blk.6.attn_k.q_output:rscale" : tensor<f32> | |
%151 = torch_c.from_builtin_tensor %__auto.blk.6.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_v.q_input3Arscale = util.global.load @"__auto.blk.6.attn_v.q_input:rscale" : tensor<f32> | |
%152 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_v.weight3Aqs = util.global.load @"__auto.blk.6.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%153 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_v.q_output3Arscale = util.global.load @"__auto.blk.6.attn_v.q_output:rscale" : tensor<f32> | |
%154 = torch_c.from_builtin_tensor %__auto.blk.6.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%155 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_output.q_input3Arscale = util.global.load @"__auto.blk.6.attn_output.q_input:rscale" : tensor<f32> | |
%156 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.attn_output.weight3Aqs = util.global.load @"__auto.blk.6.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%157 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.6.attn_output.weight3Ad = util.global.load @"__auto.blk.6.attn_output.weight:d" : tensor<f32> | |
%158 = torch_c.from_builtin_tensor %__auto.blk.6.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_norm.weight = util.global.load @__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> | |
%159 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.6.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_gate.q_input:rscale" : tensor<f32> | |
%160 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.6.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%161 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_gate.weight3Ad = util.global.load @"__auto.blk.6.ffn_gate.weight:d" : tensor<f32> | |
%162 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_up.q_input:rscale" : tensor<f32> | |
%163 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_up.weight3Aqs = util.global.load @"__auto.blk.6.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%164 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_up.weight3Ad = util.global.load @"__auto.blk.6.ffn_up.weight:d" : tensor<f32> | |
%165 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.6.ffn_down.q_input:rscale" : tensor<f32> | |
%166 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.6.ffn_down.weight3Aqs = util.global.load @"__auto.blk.6.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%167 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.6.ffn_down.weight3Ad = util.global.load @"__auto.blk.6.ffn_down.weight:d" : tensor<f32> | |
%168 = torch_c.from_builtin_tensor %__auto.blk.6.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
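// blk.7 parameter loads (same pattern as blk.1).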
%__auto.blk.7.attn_norm.weight = util.global.load @__auto.blk.7.attn_norm.weight : tensor<4096xbf16> | |
%169 = torch_c.from_builtin_tensor %__auto.blk.7.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.7.attn_q.q_input3Arscale = util.global.load @"__auto.blk.7.attn_q.q_input:rscale" : tensor<f32> | |
%170 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_q.weight3Aqs = util.global.load @"__auto.blk.7.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%171 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_q.q_output3Arscale = util.global.load @"__auto.blk.7.attn_q.q_output:rscale" : tensor<f32> | |
%172 = torch_c.from_builtin_tensor %__auto.blk.7.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_k.q_input3Arscale = util.global.load @"__auto.blk.7.attn_k.q_input:rscale" : tensor<f32> | |
%173 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_k.weight3Aqs = util.global.load @"__auto.blk.7.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%174 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_k.q_output3Arscale = util.global.load @"__auto.blk.7.attn_k.q_output:rscale" : tensor<f32> | |
%175 = torch_c.from_builtin_tensor %__auto.blk.7.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_v.q_input3Arscale = util.global.load @"__auto.blk.7.attn_v.q_input:rscale" : tensor<f32> | |
%176 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_v.weight3Aqs = util.global.load @"__auto.blk.7.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%177 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_v.q_output3Arscale = util.global.load @"__auto.blk.7.attn_v.q_output:rscale" : tensor<f32> | |
%178 = torch_c.from_builtin_tensor %__auto.blk.7.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%179 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_output.q_input3Arscale = util.global.load @"__auto.blk.7.attn_output.q_input:rscale" : tensor<f32> | |
%180 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.attn_output.weight3Aqs = util.global.load @"__auto.blk.7.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%181 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.7.attn_output.weight3Ad = util.global.load @"__auto.blk.7.attn_output.weight:d" : tensor<f32> | |
%182 = torch_c.from_builtin_tensor %__auto.blk.7.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_norm.weight = util.global.load @__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> | |
%183 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.7.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_gate.q_input:rscale" : tensor<f32> | |
%184 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.7.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%185 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_gate.weight3Ad = util.global.load @"__auto.blk.7.ffn_gate.weight:d" : tensor<f32> | |
%186 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_up.q_input:rscale" : tensor<f32> | |
%187 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_up.weight3Aqs = util.global.load @"__auto.blk.7.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%188 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_up.weight3Ad = util.global.load @"__auto.blk.7.ffn_up.weight:d" : tensor<f32> | |
%189 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.7.ffn_down.q_input:rscale" : tensor<f32> | |
%190 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.7.ffn_down.weight3Aqs = util.global.load @"__auto.blk.7.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%191 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.7.ffn_down.weight3Ad = util.global.load @"__auto.blk.7.ffn_down.weight:d" : tensor<f32> | |
%192 = torch_c.from_builtin_tensor %__auto.blk.7.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
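// blk.8 parameter loads (same pattern as blk.1).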
%__auto.blk.8.attn_norm.weight = util.global.load @__auto.blk.8.attn_norm.weight : tensor<4096xbf16> | |
%193 = torch_c.from_builtin_tensor %__auto.blk.8.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.8.attn_q.q_input3Arscale = util.global.load @"__auto.blk.8.attn_q.q_input:rscale" : tensor<f32> | |
%194 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_q.weight3Aqs = util.global.load @"__auto.blk.8.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%195 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_q.q_output3Arscale = util.global.load @"__auto.blk.8.attn_q.q_output:rscale" : tensor<f32> | |
%196 = torch_c.from_builtin_tensor %__auto.blk.8.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_k.q_input3Arscale = util.global.load @"__auto.blk.8.attn_k.q_input:rscale" : tensor<f32> | |
%197 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_k.weight3Aqs = util.global.load @"__auto.blk.8.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%198 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_k.q_output3Arscale = util.global.load @"__auto.blk.8.attn_k.q_output:rscale" : tensor<f32> | |
%199 = torch_c.from_builtin_tensor %__auto.blk.8.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_v.q_input3Arscale = util.global.load @"__auto.blk.8.attn_v.q_input:rscale" : tensor<f32> | |
%200 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_v.weight3Aqs = util.global.load @"__auto.blk.8.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%201 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_v.q_output3Arscale = util.global.load @"__auto.blk.8.attn_v.q_output:rscale" : tensor<f32> | |
%202 = torch_c.from_builtin_tensor %__auto.blk.8.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%203 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_output.q_input3Arscale = util.global.load @"__auto.blk.8.attn_output.q_input:rscale" : tensor<f32> | |
%204 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.attn_output.weight3Aqs = util.global.load @"__auto.blk.8.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%205 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.8.attn_output.weight3Ad = util.global.load @"__auto.blk.8.attn_output.weight:d" : tensor<f32> | |
%206 = torch_c.from_builtin_tensor %__auto.blk.8.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_norm.weight = util.global.load @__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> | |
%207 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.8.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_gate.q_input:rscale" : tensor<f32> | |
%208 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.8.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%209 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_gate.weight3Ad = util.global.load @"__auto.blk.8.ffn_gate.weight:d" : tensor<f32> | |
%210 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_up.q_input:rscale" : tensor<f32> | |
%211 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_up.weight3Aqs = util.global.load @"__auto.blk.8.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%212 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_up.weight3Ad = util.global.load @"__auto.blk.8.ffn_up.weight:d" : tensor<f32> | |
%213 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.8.ffn_down.q_input:rscale" : tensor<f32> | |
%214 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.8.ffn_down.weight3Aqs = util.global.load @"__auto.blk.8.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%215 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.8.ffn_down.weight3Ad = util.global.load @"__auto.blk.8.ffn_down.weight:d" : tensor<f32> | |
%216 = torch_c.from_builtin_tensor %__auto.blk.8.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
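// blk.9 parameter loads (same pattern as blk.1).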
%__auto.blk.9.attn_norm.weight = util.global.load @__auto.blk.9.attn_norm.weight : tensor<4096xbf16> | |
%217 = torch_c.from_builtin_tensor %__auto.blk.9.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.9.attn_q.q_input3Arscale = util.global.load @"__auto.blk.9.attn_q.q_input:rscale" : tensor<f32> | |
%218 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_q.weight3Aqs = util.global.load @"__auto.blk.9.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%219 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_q.q_output3Arscale = util.global.load @"__auto.blk.9.attn_q.q_output:rscale" : tensor<f32> | |
%220 = torch_c.from_builtin_tensor %__auto.blk.9.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_k.q_input3Arscale = util.global.load @"__auto.blk.9.attn_k.q_input:rscale" : tensor<f32> | |
%221 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_k.weight3Aqs = util.global.load @"__auto.blk.9.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%222 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_k.q_output3Arscale = util.global.load @"__auto.blk.9.attn_k.q_output:rscale" : tensor<f32> | |
%223 = torch_c.from_builtin_tensor %__auto.blk.9.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_v.q_input3Arscale = util.global.load @"__auto.blk.9.attn_v.q_input:rscale" : tensor<f32> | |
%224 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_v.weight3Aqs = util.global.load @"__auto.blk.9.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%225 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_v.q_output3Arscale = util.global.load @"__auto.blk.9.attn_v.q_output:rscale" : tensor<f32> | |
%226 = torch_c.from_builtin_tensor %__auto.blk.9.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%227 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_output.q_input3Arscale = util.global.load @"__auto.blk.9.attn_output.q_input:rscale" : tensor<f32> | |
%228 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.attn_output.weight3Aqs = util.global.load @"__auto.blk.9.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%229 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.9.attn_output.weight3Ad = util.global.load @"__auto.blk.9.attn_output.weight:d" : tensor<f32> | |
%230 = torch_c.from_builtin_tensor %__auto.blk.9.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_norm.weight = util.global.load @__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> | |
%231 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.9.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_gate.q_input:rscale" : tensor<f32> | |
%232 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.9.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%233 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_gate.weight3Ad = util.global.load @"__auto.blk.9.ffn_gate.weight:d" : tensor<f32> | |
%234 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_up.q_input:rscale" : tensor<f32> | |
%235 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_up.weight3Aqs = util.global.load @"__auto.blk.9.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%236 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_up.weight3Ad = util.global.load @"__auto.blk.9.ffn_up.weight:d" : tensor<f32> | |
%237 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.9.ffn_down.q_input:rscale" : tensor<f32> | |
%238 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.9.ffn_down.weight3Aqs = util.global.load @"__auto.blk.9.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%239 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.9.ffn_down.weight3Ad = util.global.load @"__auto.blk.9.ffn_down.weight:d" : tensor<f32> | |
%240 = torch_c.from_builtin_tensor %__auto.blk.9.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
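// blk.10 parameter loads (same pattern as blk.1).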
%__auto.blk.10.attn_norm.weight = util.global.load @__auto.blk.10.attn_norm.weight : tensor<4096xbf16> | |
%241 = torch_c.from_builtin_tensor %__auto.blk.10.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.10.attn_q.q_input3Arscale = util.global.load @"__auto.blk.10.attn_q.q_input:rscale" : tensor<f32> | |
%242 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_q.weight3Aqs = util.global.load @"__auto.blk.10.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%243 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_q.q_output3Arscale = util.global.load @"__auto.blk.10.attn_q.q_output:rscale" : tensor<f32> | |
%244 = torch_c.from_builtin_tensor %__auto.blk.10.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_k.q_input3Arscale = util.global.load @"__auto.blk.10.attn_k.q_input:rscale" : tensor<f32> | |
%245 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_k.weight3Aqs = util.global.load @"__auto.blk.10.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%246 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_k.q_output3Arscale = util.global.load @"__auto.blk.10.attn_k.q_output:rscale" : tensor<f32> | |
%247 = torch_c.from_builtin_tensor %__auto.blk.10.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_v.q_input3Arscale = util.global.load @"__auto.blk.10.attn_v.q_input:rscale" : tensor<f32> | |
%248 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_v.weight3Aqs = util.global.load @"__auto.blk.10.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%249 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_v.q_output3Arscale = util.global.load @"__auto.blk.10.attn_v.q_output:rscale" : tensor<f32> | |
%250 = torch_c.from_builtin_tensor %__auto.blk.10.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%251 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_output.q_input3Arscale = util.global.load @"__auto.blk.10.attn_output.q_input:rscale" : tensor<f32> | |
%252 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.attn_output.weight3Aqs = util.global.load @"__auto.blk.10.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%253 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.10.attn_output.weight3Ad = util.global.load @"__auto.blk.10.attn_output.weight:d" : tensor<f32> | |
%254 = torch_c.from_builtin_tensor %__auto.blk.10.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_norm.weight = util.global.load @__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> | |
%255 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.10.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_gate.q_input:rscale" : tensor<f32> | |
%256 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.10.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%257 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_gate.weight3Ad = util.global.load @"__auto.blk.10.ffn_gate.weight:d" : tensor<f32> | |
%258 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_up.q_input:rscale" : tensor<f32> | |
%259 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_up.weight3Aqs = util.global.load @"__auto.blk.10.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%260 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_up.weight3Ad = util.global.load @"__auto.blk.10.ffn_up.weight:d" : tensor<f32> | |
%261 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.10.ffn_down.q_input:rscale" : tensor<f32> | |
%262 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.10.ffn_down.weight3Aqs = util.global.load @"__auto.blk.10.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%263 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.10.ffn_down.weight3Ad = util.global.load @"__auto.blk.10.ffn_down.weight:d" : tensor<f32> | |
%264 = torch_c.from_builtin_tensor %__auto.blk.10.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
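// blk.11 parameter loads (same pattern as blk.1).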
%__auto.blk.11.attn_norm.weight = util.global.load @__auto.blk.11.attn_norm.weight : tensor<4096xbf16> | |
%265 = torch_c.from_builtin_tensor %__auto.blk.11.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.11.attn_q.q_input3Arscale = util.global.load @"__auto.blk.11.attn_q.q_input:rscale" : tensor<f32> | |
%266 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_q.weight3Aqs = util.global.load @"__auto.blk.11.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%267 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_q.q_output3Arscale = util.global.load @"__auto.blk.11.attn_q.q_output:rscale" : tensor<f32> | |
%268 = torch_c.from_builtin_tensor %__auto.blk.11.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_k.q_input3Arscale = util.global.load @"__auto.blk.11.attn_k.q_input:rscale" : tensor<f32> | |
%269 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_k.weight3Aqs = util.global.load @"__auto.blk.11.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%270 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_k.q_output3Arscale = util.global.load @"__auto.blk.11.attn_k.q_output:rscale" : tensor<f32> | |
%271 = torch_c.from_builtin_tensor %__auto.blk.11.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_v.q_input3Arscale = util.global.load @"__auto.blk.11.attn_v.q_input:rscale" : tensor<f32> | |
%272 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_v.weight3Aqs = util.global.load @"__auto.blk.11.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%273 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_v.q_output3Arscale = util.global.load @"__auto.blk.11.attn_v.q_output:rscale" : tensor<f32> | |
%274 = torch_c.from_builtin_tensor %__auto.blk.11.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
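// The f32 literal below (0.0883883461, roughly 1/sqrt(128)) appears to be the attention softmax
// scaling constant for head_dim = 128; the same literal is materialized once per block.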
%275 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_output.q_input3Arscale = util.global.load @"__auto.blk.11.attn_output.q_input:rscale" : tensor<f32> | |
%276 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.attn_output.weight3Aqs = util.global.load @"__auto.blk.11.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%277 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.11.attn_output.weight3Ad = util.global.load @"__auto.blk.11.attn_output.weight:d" : tensor<f32> | |
%278 = torch_c.from_builtin_tensor %__auto.blk.11.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_norm.weight = util.global.load @__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> | |
%279 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.11.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_gate.q_input:rscale" : tensor<f32> | |
%280 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.11.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%281 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_gate.weight3Ad = util.global.load @"__auto.blk.11.ffn_gate.weight:d" : tensor<f32> | |
%282 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_up.q_input:rscale" : tensor<f32> | |
%283 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_up.weight3Aqs = util.global.load @"__auto.blk.11.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%284 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_up.weight3Ad = util.global.load @"__auto.blk.11.ffn_up.weight:d" : tensor<f32> | |
%285 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.11.ffn_down.q_input:rscale" : tensor<f32> | |
%286 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.11.ffn_down.weight3Aqs = util.global.load @"__auto.blk.11.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%287 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.11.ffn_down.weight3Ad = util.global.load @"__auto.blk.11.ffn_down.weight:d" : tensor<f32> | |
%288 = torch_c.from_builtin_tensor %__auto.blk.11.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_norm.weight = util.global.load @__auto.blk.12.attn_norm.weight : tensor<4096xbf16> | |
%289 = torch_c.from_builtin_tensor %__auto.blk.12.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.12.attn_q.q_input3Arscale = util.global.load @"__auto.blk.12.attn_q.q_input:rscale" : tensor<f32> | |
%290 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_q.weight3Aqs = util.global.load @"__auto.blk.12.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%291 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_q.q_output3Arscale = util.global.load @"__auto.blk.12.attn_q.q_output:rscale" : tensor<f32> | |
%292 = torch_c.from_builtin_tensor %__auto.blk.12.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_k.q_input3Arscale = util.global.load @"__auto.blk.12.attn_k.q_input:rscale" : tensor<f32> | |
%293 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_k.weight3Aqs = util.global.load @"__auto.blk.12.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%294 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_k.q_output3Arscale = util.global.load @"__auto.blk.12.attn_k.q_output:rscale" : tensor<f32> | |
%295 = torch_c.from_builtin_tensor %__auto.blk.12.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_v.q_input3Arscale = util.global.load @"__auto.blk.12.attn_v.q_input:rscale" : tensor<f32> | |
%296 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_v.weight3Aqs = util.global.load @"__auto.blk.12.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%297 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_v.q_output3Arscale = util.global.load @"__auto.blk.12.attn_v.q_output:rscale" : tensor<f32> | |
%298 = torch_c.from_builtin_tensor %__auto.blk.12.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%299 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_output.q_input3Arscale = util.global.load @"__auto.blk.12.attn_output.q_input:rscale" : tensor<f32> | |
%300 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.attn_output.weight3Aqs = util.global.load @"__auto.blk.12.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%301 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.12.attn_output.weight3Ad = util.global.load @"__auto.blk.12.attn_output.weight:d" : tensor<f32> | |
%302 = torch_c.from_builtin_tensor %__auto.blk.12.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_norm.weight = util.global.load @__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> | |
%303 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.12.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_gate.q_input:rscale" : tensor<f32> | |
%304 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.12.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%305 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_gate.weight3Ad = util.global.load @"__auto.blk.12.ffn_gate.weight:d" : tensor<f32> | |
%306 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_up.q_input:rscale" : tensor<f32> | |
%307 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_up.weight3Aqs = util.global.load @"__auto.blk.12.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%308 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_up.weight3Ad = util.global.load @"__auto.blk.12.ffn_up.weight:d" : tensor<f32> | |
%309 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.12.ffn_down.q_input:rscale" : tensor<f32> | |
%310 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.12.ffn_down.weight3Aqs = util.global.load @"__auto.blk.12.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%311 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.12.ffn_down.weight3Ad = util.global.load @"__auto.blk.12.ffn_down.weight:d" : tensor<f32> | |
%312 = torch_c.from_builtin_tensor %__auto.blk.12.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_norm.weight = util.global.load @__auto.blk.13.attn_norm.weight : tensor<4096xbf16> | |
%313 = torch_c.from_builtin_tensor %__auto.blk.13.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.13.attn_q.q_input3Arscale = util.global.load @"__auto.blk.13.attn_q.q_input:rscale" : tensor<f32> | |
%314 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_q.weight3Aqs = util.global.load @"__auto.blk.13.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%315 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_q.q_output3Arscale = util.global.load @"__auto.blk.13.attn_q.q_output:rscale" : tensor<f32> | |
%316 = torch_c.from_builtin_tensor %__auto.blk.13.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_k.q_input3Arscale = util.global.load @"__auto.blk.13.attn_k.q_input:rscale" : tensor<f32> | |
%317 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_k.weight3Aqs = util.global.load @"__auto.blk.13.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%318 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_k.q_output3Arscale = util.global.load @"__auto.blk.13.attn_k.q_output:rscale" : tensor<f32> | |
%319 = torch_c.from_builtin_tensor %__auto.blk.13.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_v.q_input3Arscale = util.global.load @"__auto.blk.13.attn_v.q_input:rscale" : tensor<f32> | |
%320 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_v.weight3Aqs = util.global.load @"__auto.blk.13.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%321 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_v.q_output3Arscale = util.global.load @"__auto.blk.13.attn_v.q_output:rscale" : tensor<f32> | |
%322 = torch_c.from_builtin_tensor %__auto.blk.13.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%323 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_output.q_input3Arscale = util.global.load @"__auto.blk.13.attn_output.q_input:rscale" : tensor<f32> | |
%324 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.attn_output.weight3Aqs = util.global.load @"__auto.blk.13.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%325 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.13.attn_output.weight3Ad = util.global.load @"__auto.blk.13.attn_output.weight:d" : tensor<f32> | |
%326 = torch_c.from_builtin_tensor %__auto.blk.13.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_norm.weight = util.global.load @__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> | |
%327 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.13.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_gate.q_input:rscale" : tensor<f32> | |
%328 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.13.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%329 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_gate.weight3Ad = util.global.load @"__auto.blk.13.ffn_gate.weight:d" : tensor<f32> | |
%330 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_up.q_input:rscale" : tensor<f32> | |
%331 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_up.weight3Aqs = util.global.load @"__auto.blk.13.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%332 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_up.weight3Ad = util.global.load @"__auto.blk.13.ffn_up.weight:d" : tensor<f32> | |
%333 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.13.ffn_down.q_input:rscale" : tensor<f32> | |
%334 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.13.ffn_down.weight3Aqs = util.global.load @"__auto.blk.13.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%335 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.13.ffn_down.weight3Ad = util.global.load @"__auto.blk.13.ffn_down.weight:d" : tensor<f32> | |
%336 = torch_c.from_builtin_tensor %__auto.blk.13.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_norm.weight = util.global.load @__auto.blk.14.attn_norm.weight : tensor<4096xbf16> | |
%337 = torch_c.from_builtin_tensor %__auto.blk.14.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.14.attn_q.q_input3Arscale = util.global.load @"__auto.blk.14.attn_q.q_input:rscale" : tensor<f32> | |
%338 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_q.weight3Aqs = util.global.load @"__auto.blk.14.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%339 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_q.q_output3Arscale = util.global.load @"__auto.blk.14.attn_q.q_output:rscale" : tensor<f32> | |
%340 = torch_c.from_builtin_tensor %__auto.blk.14.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_k.q_input3Arscale = util.global.load @"__auto.blk.14.attn_k.q_input:rscale" : tensor<f32> | |
%341 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_k.weight3Aqs = util.global.load @"__auto.blk.14.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%342 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_k.q_output3Arscale = util.global.load @"__auto.blk.14.attn_k.q_output:rscale" : tensor<f32> | |
%343 = torch_c.from_builtin_tensor %__auto.blk.14.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_v.q_input3Arscale = util.global.load @"__auto.blk.14.attn_v.q_input:rscale" : tensor<f32> | |
%344 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_v.weight3Aqs = util.global.load @"__auto.blk.14.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%345 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_v.q_output3Arscale = util.global.load @"__auto.blk.14.attn_v.q_output:rscale" : tensor<f32> | |
%346 = torch_c.from_builtin_tensor %__auto.blk.14.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%347 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_output.q_input3Arscale = util.global.load @"__auto.blk.14.attn_output.q_input:rscale" : tensor<f32> | |
%348 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.attn_output.weight3Aqs = util.global.load @"__auto.blk.14.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%349 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.14.attn_output.weight3Ad = util.global.load @"__auto.blk.14.attn_output.weight:d" : tensor<f32> | |
%350 = torch_c.from_builtin_tensor %__auto.blk.14.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_norm.weight = util.global.load @__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> | |
%351 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.14.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_gate.q_input:rscale" : tensor<f32> | |
%352 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.14.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%353 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_gate.weight3Ad = util.global.load @"__auto.blk.14.ffn_gate.weight:d" : tensor<f32> | |
%354 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_up.q_input:rscale" : tensor<f32> | |
%355 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_up.weight3Aqs = util.global.load @"__auto.blk.14.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%356 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_up.weight3Ad = util.global.load @"__auto.blk.14.ffn_up.weight:d" : tensor<f32> | |
%357 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.14.ffn_down.q_input:rscale" : tensor<f32> | |
%358 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.14.ffn_down.weight3Aqs = util.global.load @"__auto.blk.14.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%359 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.14.ffn_down.weight3Ad = util.global.load @"__auto.blk.14.ffn_down.weight:d" : tensor<f32> | |
%360 = torch_c.from_builtin_tensor %__auto.blk.14.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_norm.weight = util.global.load @__auto.blk.15.attn_norm.weight : tensor<4096xbf16> | |
%361 = torch_c.from_builtin_tensor %__auto.blk.15.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.15.attn_q.q_input3Arscale = util.global.load @"__auto.blk.15.attn_q.q_input:rscale" : tensor<f32> | |
%362 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_q.weight3Aqs = util.global.load @"__auto.blk.15.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%363 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_q.q_output3Arscale = util.global.load @"__auto.blk.15.attn_q.q_output:rscale" : tensor<f32> | |
%364 = torch_c.from_builtin_tensor %__auto.blk.15.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_k.q_input3Arscale = util.global.load @"__auto.blk.15.attn_k.q_input:rscale" : tensor<f32> | |
%365 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_k.weight3Aqs = util.global.load @"__auto.blk.15.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%366 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_k.q_output3Arscale = util.global.load @"__auto.blk.15.attn_k.q_output:rscale" : tensor<f32> | |
%367 = torch_c.from_builtin_tensor %__auto.blk.15.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_v.q_input3Arscale = util.global.load @"__auto.blk.15.attn_v.q_input:rscale" : tensor<f32> | |
%368 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_v.weight3Aqs = util.global.load @"__auto.blk.15.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%369 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_v.q_output3Arscale = util.global.load @"__auto.blk.15.attn_v.q_output:rscale" : tensor<f32> | |
%370 = torch_c.from_builtin_tensor %__auto.blk.15.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%371 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_output.q_input3Arscale = util.global.load @"__auto.blk.15.attn_output.q_input:rscale" : tensor<f32> | |
%372 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.attn_output.weight3Aqs = util.global.load @"__auto.blk.15.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%373 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.15.attn_output.weight3Ad = util.global.load @"__auto.blk.15.attn_output.weight:d" : tensor<f32> | |
%374 = torch_c.from_builtin_tensor %__auto.blk.15.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_norm.weight = util.global.load @__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> | |
%375 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.15.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_gate.q_input:rscale" : tensor<f32> | |
%376 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.15.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%377 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_gate.weight3Ad = util.global.load @"__auto.blk.15.ffn_gate.weight:d" : tensor<f32> | |
%378 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_up.q_input:rscale" : tensor<f32> | |
%379 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_up.weight3Aqs = util.global.load @"__auto.blk.15.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%380 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_up.weight3Ad = util.global.load @"__auto.blk.15.ffn_up.weight:d" : tensor<f32> | |
%381 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.15.ffn_down.q_input:rscale" : tensor<f32> | |
%382 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.15.ffn_down.weight3Aqs = util.global.load @"__auto.blk.15.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%383 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.15.ffn_down.weight3Ad = util.global.load @"__auto.blk.15.ffn_down.weight:d" : tensor<f32> | |
%384 = torch_c.from_builtin_tensor %__auto.blk.15.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_norm.weight = util.global.load @__auto.blk.16.attn_norm.weight : tensor<4096xbf16> | |
%385 = torch_c.from_builtin_tensor %__auto.blk.16.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.16.attn_q.q_input3Arscale = util.global.load @"__auto.blk.16.attn_q.q_input:rscale" : tensor<f32> | |
%386 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_q.weight3Aqs = util.global.load @"__auto.blk.16.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%387 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_q.q_output3Arscale = util.global.load @"__auto.blk.16.attn_q.q_output:rscale" : tensor<f32> | |
%388 = torch_c.from_builtin_tensor %__auto.blk.16.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_k.q_input3Arscale = util.global.load @"__auto.blk.16.attn_k.q_input:rscale" : tensor<f32> | |
%389 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_k.weight3Aqs = util.global.load @"__auto.blk.16.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%390 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_k.q_output3Arscale = util.global.load @"__auto.blk.16.attn_k.q_output:rscale" : tensor<f32> | |
%391 = torch_c.from_builtin_tensor %__auto.blk.16.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_v.q_input3Arscale = util.global.load @"__auto.blk.16.attn_v.q_input:rscale" : tensor<f32> | |
%392 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_v.weight3Aqs = util.global.load @"__auto.blk.16.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%393 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_v.q_output3Arscale = util.global.load @"__auto.blk.16.attn_v.q_output:rscale" : tensor<f32> | |
%394 = torch_c.from_builtin_tensor %__auto.blk.16.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%395 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_output.q_input3Arscale = util.global.load @"__auto.blk.16.attn_output.q_input:rscale" : tensor<f32> | |
%396 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.attn_output.weight3Aqs = util.global.load @"__auto.blk.16.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%397 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.16.attn_output.weight3Ad = util.global.load @"__auto.blk.16.attn_output.weight:d" : tensor<f32> | |
%398 = torch_c.from_builtin_tensor %__auto.blk.16.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_norm.weight = util.global.load @__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> | |
%399 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.16.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_gate.q_input:rscale" : tensor<f32> | |
%400 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.16.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%401 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_gate.weight3Ad = util.global.load @"__auto.blk.16.ffn_gate.weight:d" : tensor<f32> | |
%402 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_up.q_input:rscale" : tensor<f32> | |
%403 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_up.weight3Aqs = util.global.load @"__auto.blk.16.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%404 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_up.weight3Ad = util.global.load @"__auto.blk.16.ffn_up.weight:d" : tensor<f32> | |
%405 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.16.ffn_down.q_input:rscale" : tensor<f32> | |
%406 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.16.ffn_down.weight3Aqs = util.global.load @"__auto.blk.16.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%407 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.16.ffn_down.weight3Ad = util.global.load @"__auto.blk.16.ffn_down.weight:d" : tensor<f32> | |
%408 = torch_c.from_builtin_tensor %__auto.blk.16.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_norm.weight = util.global.load @__auto.blk.17.attn_norm.weight : tensor<4096xbf16> | |
%409 = torch_c.from_builtin_tensor %__auto.blk.17.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.17.attn_q.q_input3Arscale = util.global.load @"__auto.blk.17.attn_q.q_input:rscale" : tensor<f32> | |
%410 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_q.weight3Aqs = util.global.load @"__auto.blk.17.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%411 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_q.q_output3Arscale = util.global.load @"__auto.blk.17.attn_q.q_output:rscale" : tensor<f32> | |
%412 = torch_c.from_builtin_tensor %__auto.blk.17.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_k.q_input3Arscale = util.global.load @"__auto.blk.17.attn_k.q_input:rscale" : tensor<f32> | |
%413 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_k.weight3Aqs = util.global.load @"__auto.blk.17.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%414 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_k.q_output3Arscale = util.global.load @"__auto.blk.17.attn_k.q_output:rscale" : tensor<f32> | |
%415 = torch_c.from_builtin_tensor %__auto.blk.17.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_v.q_input3Arscale = util.global.load @"__auto.blk.17.attn_v.q_input:rscale" : tensor<f32> | |
%416 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_v.weight3Aqs = util.global.load @"__auto.blk.17.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%417 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_v.q_output3Arscale = util.global.load @"__auto.blk.17.attn_v.q_output:rscale" : tensor<f32> | |
%418 = torch_c.from_builtin_tensor %__auto.blk.17.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%419 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_output.q_input3Arscale = util.global.load @"__auto.blk.17.attn_output.q_input:rscale" : tensor<f32> | |
%420 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.attn_output.weight3Aqs = util.global.load @"__auto.blk.17.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%421 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.17.attn_output.weight3Ad = util.global.load @"__auto.blk.17.attn_output.weight:d" : tensor<f32> | |
%422 = torch_c.from_builtin_tensor %__auto.blk.17.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_norm.weight = util.global.load @__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> | |
%423 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.17.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_gate.q_input:rscale" : tensor<f32> | |
%424 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.17.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%425 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_gate.weight3Ad = util.global.load @"__auto.blk.17.ffn_gate.weight:d" : tensor<f32> | |
%426 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_up.q_input:rscale" : tensor<f32> | |
%427 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_up.weight3Aqs = util.global.load @"__auto.blk.17.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%428 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_up.weight3Ad = util.global.load @"__auto.blk.17.ffn_up.weight:d" : tensor<f32> | |
%429 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.17.ffn_down.q_input:rscale" : tensor<f32> | |
%430 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.17.ffn_down.weight3Aqs = util.global.load @"__auto.blk.17.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%431 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.17.ffn_down.weight3Ad = util.global.load @"__auto.blk.17.ffn_down.weight:d" : tensor<f32> | |
%432 = torch_c.from_builtin_tensor %__auto.blk.17.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_norm.weight = util.global.load @__auto.blk.18.attn_norm.weight : tensor<4096xbf16> | |
%433 = torch_c.from_builtin_tensor %__auto.blk.18.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.18.attn_q.q_input3Arscale = util.global.load @"__auto.blk.18.attn_q.q_input:rscale" : tensor<f32> | |
%434 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_q.weight3Aqs = util.global.load @"__auto.blk.18.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%435 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_q.q_output3Arscale = util.global.load @"__auto.blk.18.attn_q.q_output:rscale" : tensor<f32> | |
%436 = torch_c.from_builtin_tensor %__auto.blk.18.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_k.q_input3Arscale = util.global.load @"__auto.blk.18.attn_k.q_input:rscale" : tensor<f32> | |
%437 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_k.weight3Aqs = util.global.load @"__auto.blk.18.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%438 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_k.q_output3Arscale = util.global.load @"__auto.blk.18.attn_k.q_output:rscale" : tensor<f32> | |
%439 = torch_c.from_builtin_tensor %__auto.blk.18.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_v.q_input3Arscale = util.global.load @"__auto.blk.18.attn_v.q_input:rscale" : tensor<f32> | |
%440 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_v.weight3Aqs = util.global.load @"__auto.blk.18.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%441 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_v.q_output3Arscale = util.global.load @"__auto.blk.18.attn_v.q_output:rscale" : tensor<f32> | |
%442 = torch_c.from_builtin_tensor %__auto.blk.18.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%443 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_output.q_input3Arscale = util.global.load @"__auto.blk.18.attn_output.q_input:rscale" : tensor<f32> | |
%444 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.attn_output.weight3Aqs = util.global.load @"__auto.blk.18.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%445 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.18.attn_output.weight3Ad = util.global.load @"__auto.blk.18.attn_output.weight:d" : tensor<f32> | |
%446 = torch_c.from_builtin_tensor %__auto.blk.18.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_norm.weight = util.global.load @__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> | |
%447 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.18.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_gate.q_input:rscale" : tensor<f32> | |
%448 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.18.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%449 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_gate.weight3Ad = util.global.load @"__auto.blk.18.ffn_gate.weight:d" : tensor<f32> | |
%450 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_up.q_input:rscale" : tensor<f32> | |
%451 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_up.weight3Aqs = util.global.load @"__auto.blk.18.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%452 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_up.weight3Ad = util.global.load @"__auto.blk.18.ffn_up.weight:d" : tensor<f32> | |
%453 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.18.ffn_down.q_input:rscale" : tensor<f32> | |
%454 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.18.ffn_down.weight3Aqs = util.global.load @"__auto.blk.18.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%455 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.18.ffn_down.weight3Ad = util.global.load @"__auto.blk.18.ffn_down.weight:d" : tensor<f32> | |
%456 = torch_c.from_builtin_tensor %__auto.blk.18.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_norm.weight = util.global.load @__auto.blk.19.attn_norm.weight : tensor<4096xbf16> | |
%457 = torch_c.from_builtin_tensor %__auto.blk.19.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.19.attn_q.q_input3Arscale = util.global.load @"__auto.blk.19.attn_q.q_input:rscale" : tensor<f32> | |
%458 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_q.weight3Aqs = util.global.load @"__auto.blk.19.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%459 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_q.q_output3Arscale = util.global.load @"__auto.blk.19.attn_q.q_output:rscale" : tensor<f32> | |
%460 = torch_c.from_builtin_tensor %__auto.blk.19.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_k.q_input3Arscale = util.global.load @"__auto.blk.19.attn_k.q_input:rscale" : tensor<f32> | |
%461 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_k.weight3Aqs = util.global.load @"__auto.blk.19.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%462 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_k.q_output3Arscale = util.global.load @"__auto.blk.19.attn_k.q_output:rscale" : tensor<f32> | |
%463 = torch_c.from_builtin_tensor %__auto.blk.19.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_v.q_input3Arscale = util.global.load @"__auto.blk.19.attn_v.q_input:rscale" : tensor<f32> | |
%464 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_v.weight3Aqs = util.global.load @"__auto.blk.19.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%465 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_v.q_output3Arscale = util.global.load @"__auto.blk.19.attn_v.q_output:rscale" : tensor<f32> | |
%466 = torch_c.from_builtin_tensor %__auto.blk.19.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%467 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_output.q_input3Arscale = util.global.load @"__auto.blk.19.attn_output.q_input:rscale" : tensor<f32> | |
%468 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.attn_output.weight3Aqs = util.global.load @"__auto.blk.19.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%469 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.19.attn_output.weight3Ad = util.global.load @"__auto.blk.19.attn_output.weight:d" : tensor<f32> | |
%470 = torch_c.from_builtin_tensor %__auto.blk.19.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_norm.weight = util.global.load @__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> | |
%471 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.19.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_gate.q_input:rscale" : tensor<f32> | |
%472 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.19.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%473 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_gate.weight3Ad = util.global.load @"__auto.blk.19.ffn_gate.weight:d" : tensor<f32> | |
%474 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_up.q_input:rscale" : tensor<f32> | |
%475 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_up.weight3Aqs = util.global.load @"__auto.blk.19.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%476 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_up.weight3Ad = util.global.load @"__auto.blk.19.ffn_up.weight:d" : tensor<f32> | |
%477 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.19.ffn_down.q_input:rscale" : tensor<f32> | |
%478 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.19.ffn_down.weight3Aqs = util.global.load @"__auto.blk.19.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%479 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.19.ffn_down.weight3Ad = util.global.load @"__auto.blk.19.ffn_down.weight:d" : tensor<f32> | |
%480 = torch_c.from_builtin_tensor %__auto.blk.19.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_norm.weight = util.global.load @__auto.blk.20.attn_norm.weight : tensor<4096xbf16> | |
%481 = torch_c.from_builtin_tensor %__auto.blk.20.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.20.attn_q.q_input3Arscale = util.global.load @"__auto.blk.20.attn_q.q_input:rscale" : tensor<f32> | |
%482 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_q.weight3Aqs = util.global.load @"__auto.blk.20.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%483 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_q.q_output3Arscale = util.global.load @"__auto.blk.20.attn_q.q_output:rscale" : tensor<f32> | |
%484 = torch_c.from_builtin_tensor %__auto.blk.20.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_k.q_input3Arscale = util.global.load @"__auto.blk.20.attn_k.q_input:rscale" : tensor<f32> | |
%485 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_k.weight3Aqs = util.global.load @"__auto.blk.20.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%486 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_k.q_output3Arscale = util.global.load @"__auto.blk.20.attn_k.q_output:rscale" : tensor<f32> | |
%487 = torch_c.from_builtin_tensor %__auto.blk.20.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_v.q_input3Arscale = util.global.load @"__auto.blk.20.attn_v.q_input:rscale" : tensor<f32> | |
%488 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_v.weight3Aqs = util.global.load @"__auto.blk.20.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%489 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_v.q_output3Arscale = util.global.load @"__auto.blk.20.attn_v.q_output:rscale" : tensor<f32> | |
%490 = torch_c.from_builtin_tensor %__auto.blk.20.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%491 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_output.q_input3Arscale = util.global.load @"__auto.blk.20.attn_output.q_input:rscale" : tensor<f32> | |
%492 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.attn_output.weight3Aqs = util.global.load @"__auto.blk.20.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%493 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.20.attn_output.weight3Ad = util.global.load @"__auto.blk.20.attn_output.weight:d" : tensor<f32> | |
%494 = torch_c.from_builtin_tensor %__auto.blk.20.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_norm.weight = util.global.load @__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> | |
%495 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.20.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_gate.q_input:rscale" : tensor<f32> | |
%496 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.20.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%497 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_gate.weight3Ad = util.global.load @"__auto.blk.20.ffn_gate.weight:d" : tensor<f32> | |
%498 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_up.q_input:rscale" : tensor<f32> | |
%499 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_up.weight3Aqs = util.global.load @"__auto.blk.20.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%500 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_up.weight3Ad = util.global.load @"__auto.blk.20.ffn_up.weight:d" : tensor<f32> | |
%501 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.20.ffn_down.q_input:rscale" : tensor<f32> | |
%502 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.20.ffn_down.weight3Aqs = util.global.load @"__auto.blk.20.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%503 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.20.ffn_down.weight3Ad = util.global.load @"__auto.blk.20.ffn_down.weight:d" : tensor<f32> | |
%504 = torch_c.from_builtin_tensor %__auto.blk.20.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_norm.weight = util.global.load @__auto.blk.21.attn_norm.weight : tensor<4096xbf16> | |
%505 = torch_c.from_builtin_tensor %__auto.blk.21.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.21.attn_q.q_input3Arscale = util.global.load @"__auto.blk.21.attn_q.q_input:rscale" : tensor<f32> | |
%506 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_q.weight3Aqs = util.global.load @"__auto.blk.21.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%507 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_q.q_output3Arscale = util.global.load @"__auto.blk.21.attn_q.q_output:rscale" : tensor<f32> | |
%508 = torch_c.from_builtin_tensor %__auto.blk.21.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_k.q_input3Arscale = util.global.load @"__auto.blk.21.attn_k.q_input:rscale" : tensor<f32> | |
%509 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_k.weight3Aqs = util.global.load @"__auto.blk.21.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%510 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_k.q_output3Arscale = util.global.load @"__auto.blk.21.attn_k.q_output:rscale" : tensor<f32> | |
%511 = torch_c.from_builtin_tensor %__auto.blk.21.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_v.q_input3Arscale = util.global.load @"__auto.blk.21.attn_v.q_input:rscale" : tensor<f32> | |
%512 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_v.weight3Aqs = util.global.load @"__auto.blk.21.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%513 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_v.q_output3Arscale = util.global.load @"__auto.blk.21.attn_v.q_output:rscale" : tensor<f32> | |
%514 = torch_c.from_builtin_tensor %__auto.blk.21.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%515 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_output.q_input3Arscale = util.global.load @"__auto.blk.21.attn_output.q_input:rscale" : tensor<f32> | |
%516 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.attn_output.weight3Aqs = util.global.load @"__auto.blk.21.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%517 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.21.attn_output.weight3Ad = util.global.load @"__auto.blk.21.attn_output.weight:d" : tensor<f32> | |
%518 = torch_c.from_builtin_tensor %__auto.blk.21.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_norm.weight = util.global.load @__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> | |
%519 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.21.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_gate.q_input:rscale" : tensor<f32> | |
%520 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.21.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%521 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_gate.weight3Ad = util.global.load @"__auto.blk.21.ffn_gate.weight:d" : tensor<f32> | |
%522 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_up.q_input:rscale" : tensor<f32> | |
%523 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_up.weight3Aqs = util.global.load @"__auto.blk.21.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%524 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_up.weight3Ad = util.global.load @"__auto.blk.21.ffn_up.weight:d" : tensor<f32> | |
%525 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.21.ffn_down.q_input:rscale" : tensor<f32> | |
%526 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.21.ffn_down.weight3Aqs = util.global.load @"__auto.blk.21.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%527 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.21.ffn_down.weight3Ad = util.global.load @"__auto.blk.21.ffn_down.weight:d" : tensor<f32> | |
%528 = torch_c.from_builtin_tensor %__auto.blk.21.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_norm.weight = util.global.load @__auto.blk.22.attn_norm.weight : tensor<4096xbf16> | |
%529 = torch_c.from_builtin_tensor %__auto.blk.22.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.22.attn_q.q_input3Arscale = util.global.load @"__auto.blk.22.attn_q.q_input:rscale" : tensor<f32> | |
%530 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_q.weight3Aqs = util.global.load @"__auto.blk.22.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%531 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_q.q_output3Arscale = util.global.load @"__auto.blk.22.attn_q.q_output:rscale" : tensor<f32> | |
%532 = torch_c.from_builtin_tensor %__auto.blk.22.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_k.q_input3Arscale = util.global.load @"__auto.blk.22.attn_k.q_input:rscale" : tensor<f32> | |
%533 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_k.weight3Aqs = util.global.load @"__auto.blk.22.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%534 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_k.q_output3Arscale = util.global.load @"__auto.blk.22.attn_k.q_output:rscale" : tensor<f32> | |
%535 = torch_c.from_builtin_tensor %__auto.blk.22.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_v.q_input3Arscale = util.global.load @"__auto.blk.22.attn_v.q_input:rscale" : tensor<f32> | |
%536 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_v.weight3Aqs = util.global.load @"__auto.blk.22.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%537 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_v.q_output3Arscale = util.global.load @"__auto.blk.22.attn_v.q_output:rscale" : tensor<f32> | |
%538 = torch_c.from_builtin_tensor %__auto.blk.22.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%539 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_output.q_input3Arscale = util.global.load @"__auto.blk.22.attn_output.q_input:rscale" : tensor<f32> | |
%540 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.attn_output.weight3Aqs = util.global.load @"__auto.blk.22.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%541 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.22.attn_output.weight3Ad = util.global.load @"__auto.blk.22.attn_output.weight:d" : tensor<f32> | |
%542 = torch_c.from_builtin_tensor %__auto.blk.22.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_norm.weight = util.global.load @__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> | |
%543 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.22.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_gate.q_input:rscale" : tensor<f32> | |
%544 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.22.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%545 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_gate.weight3Ad = util.global.load @"__auto.blk.22.ffn_gate.weight:d" : tensor<f32> | |
%546 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_up.q_input:rscale" : tensor<f32> | |
%547 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_up.weight3Aqs = util.global.load @"__auto.blk.22.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%548 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_up.weight3Ad = util.global.load @"__auto.blk.22.ffn_up.weight:d" : tensor<f32> | |
%549 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.22.ffn_down.q_input:rscale" : tensor<f32> | |
%550 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.22.ffn_down.weight3Aqs = util.global.load @"__auto.blk.22.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%551 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.22.ffn_down.weight3Ad = util.global.load @"__auto.blk.22.ffn_down.weight:d" : tensor<f32> | |
%552 = torch_c.from_builtin_tensor %__auto.blk.22.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_norm.weight = util.global.load @__auto.blk.23.attn_norm.weight : tensor<4096xbf16> | |
%553 = torch_c.from_builtin_tensor %__auto.blk.23.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.23.attn_q.q_input3Arscale = util.global.load @"__auto.blk.23.attn_q.q_input:rscale" : tensor<f32> | |
%554 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_q.weight3Aqs = util.global.load @"__auto.blk.23.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%555 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_q.q_output3Arscale = util.global.load @"__auto.blk.23.attn_q.q_output:rscale" : tensor<f32> | |
%556 = torch_c.from_builtin_tensor %__auto.blk.23.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_k.q_input3Arscale = util.global.load @"__auto.blk.23.attn_k.q_input:rscale" : tensor<f32> | |
%557 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_k.weight3Aqs = util.global.load @"__auto.blk.23.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%558 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_k.q_output3Arscale = util.global.load @"__auto.blk.23.attn_k.q_output:rscale" : tensor<f32> | |
%559 = torch_c.from_builtin_tensor %__auto.blk.23.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_v.q_input3Arscale = util.global.load @"__auto.blk.23.attn_v.q_input:rscale" : tensor<f32> | |
%560 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_v.weight3Aqs = util.global.load @"__auto.blk.23.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%561 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_v.q_output3Arscale = util.global.load @"__auto.blk.23.attn_v.q_output:rscale" : tensor<f32> | |
%562 = torch_c.from_builtin_tensor %__auto.blk.23.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%563 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_output.q_input3Arscale = util.global.load @"__auto.blk.23.attn_output.q_input:rscale" : tensor<f32> | |
%564 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.attn_output.weight3Aqs = util.global.load @"__auto.blk.23.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%565 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.23.attn_output.weight3Ad = util.global.load @"__auto.blk.23.attn_output.weight:d" : tensor<f32> | |
%566 = torch_c.from_builtin_tensor %__auto.blk.23.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_norm.weight = util.global.load @__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> | |
%567 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.23.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_gate.q_input:rscale" : tensor<f32> | |
%568 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.23.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%569 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_gate.weight3Ad = util.global.load @"__auto.blk.23.ffn_gate.weight:d" : tensor<f32> | |
%570 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_up.q_input:rscale" : tensor<f32> | |
%571 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_up.weight3Aqs = util.global.load @"__auto.blk.23.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%572 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_up.weight3Ad = util.global.load @"__auto.blk.23.ffn_up.weight:d" : tensor<f32> | |
%573 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.23.ffn_down.q_input:rscale" : tensor<f32> | |
%574 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.23.ffn_down.weight3Aqs = util.global.load @"__auto.blk.23.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%575 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.23.ffn_down.weight3Ad = util.global.load @"__auto.blk.23.ffn_down.weight:d" : tensor<f32> | |
%576 = torch_c.from_builtin_tensor %__auto.blk.23.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_norm.weight = util.global.load @__auto.blk.24.attn_norm.weight : tensor<4096xbf16> | |
%577 = torch_c.from_builtin_tensor %__auto.blk.24.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.24.attn_q.q_input3Arscale = util.global.load @"__auto.blk.24.attn_q.q_input:rscale" : tensor<f32> | |
%578 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_q.weight3Aqs = util.global.load @"__auto.blk.24.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%579 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_q.q_output3Arscale = util.global.load @"__auto.blk.24.attn_q.q_output:rscale" : tensor<f32> | |
%580 = torch_c.from_builtin_tensor %__auto.blk.24.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_k.q_input3Arscale = util.global.load @"__auto.blk.24.attn_k.q_input:rscale" : tensor<f32> | |
%581 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_k.weight3Aqs = util.global.load @"__auto.blk.24.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%582 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_k.q_output3Arscale = util.global.load @"__auto.blk.24.attn_k.q_output:rscale" : tensor<f32> | |
%583 = torch_c.from_builtin_tensor %__auto.blk.24.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_v.q_input3Arscale = util.global.load @"__auto.blk.24.attn_v.q_input:rscale" : tensor<f32> | |
%584 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_v.weight3Aqs = util.global.load @"__auto.blk.24.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%585 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_v.q_output3Arscale = util.global.load @"__auto.blk.24.attn_v.q_output:rscale" : tensor<f32> | |
%586 = torch_c.from_builtin_tensor %__auto.blk.24.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%587 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_output.q_input3Arscale = util.global.load @"__auto.blk.24.attn_output.q_input:rscale" : tensor<f32> | |
%588 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.attn_output.weight3Aqs = util.global.load @"__auto.blk.24.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%589 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.24.attn_output.weight3Ad = util.global.load @"__auto.blk.24.attn_output.weight:d" : tensor<f32> | |
%590 = torch_c.from_builtin_tensor %__auto.blk.24.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_norm.weight = util.global.load @__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> | |
%591 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.24.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_gate.q_input:rscale" : tensor<f32> | |
%592 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.24.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%593 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_gate.weight3Ad = util.global.load @"__auto.blk.24.ffn_gate.weight:d" : tensor<f32> | |
%594 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_up.q_input:rscale" : tensor<f32> | |
%595 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_up.weight3Aqs = util.global.load @"__auto.blk.24.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%596 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_up.weight3Ad = util.global.load @"__auto.blk.24.ffn_up.weight:d" : tensor<f32> | |
%597 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.24.ffn_down.q_input:rscale" : tensor<f32> | |
%598 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.24.ffn_down.weight3Aqs = util.global.load @"__auto.blk.24.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%599 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.24.ffn_down.weight3Ad = util.global.load @"__auto.blk.24.ffn_down.weight:d" : tensor<f32> | |
%600 = torch_c.from_builtin_tensor %__auto.blk.24.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_norm.weight = util.global.load @__auto.blk.25.attn_norm.weight : tensor<4096xbf16> | |
%601 = torch_c.from_builtin_tensor %__auto.blk.25.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.25.attn_q.q_input3Arscale = util.global.load @"__auto.blk.25.attn_q.q_input:rscale" : tensor<f32> | |
%602 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_q.weight3Aqs = util.global.load @"__auto.blk.25.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%603 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_q.q_output3Arscale = util.global.load @"__auto.blk.25.attn_q.q_output:rscale" : tensor<f32> | |
%604 = torch_c.from_builtin_tensor %__auto.blk.25.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_k.q_input3Arscale = util.global.load @"__auto.blk.25.attn_k.q_input:rscale" : tensor<f32> | |
%605 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_k.weight3Aqs = util.global.load @"__auto.blk.25.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%606 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_k.q_output3Arscale = util.global.load @"__auto.blk.25.attn_k.q_output:rscale" : tensor<f32> | |
%607 = torch_c.from_builtin_tensor %__auto.blk.25.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_v.q_input3Arscale = util.global.load @"__auto.blk.25.attn_v.q_input:rscale" : tensor<f32> | |
%608 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_v.weight3Aqs = util.global.load @"__auto.blk.25.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%609 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_v.q_output3Arscale = util.global.load @"__auto.blk.25.attn_v.q_output:rscale" : tensor<f32> | |
%610 = torch_c.from_builtin_tensor %__auto.blk.25.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%611 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_output.q_input3Arscale = util.global.load @"__auto.blk.25.attn_output.q_input:rscale" : tensor<f32> | |
%612 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.attn_output.weight3Aqs = util.global.load @"__auto.blk.25.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%613 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.25.attn_output.weight3Ad = util.global.load @"__auto.blk.25.attn_output.weight:d" : tensor<f32> | |
%614 = torch_c.from_builtin_tensor %__auto.blk.25.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_norm.weight = util.global.load @__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> | |
%615 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.25.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_gate.q_input:rscale" : tensor<f32> | |
%616 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.25.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%617 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_gate.weight3Ad = util.global.load @"__auto.blk.25.ffn_gate.weight:d" : tensor<f32> | |
%618 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_up.q_input:rscale" : tensor<f32> | |
%619 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_up.weight3Aqs = util.global.load @"__auto.blk.25.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%620 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_up.weight3Ad = util.global.load @"__auto.blk.25.ffn_up.weight:d" : tensor<f32> | |
%621 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.25.ffn_down.q_input:rscale" : tensor<f32> | |
%622 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.25.ffn_down.weight3Aqs = util.global.load @"__auto.blk.25.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%623 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.25.ffn_down.weight3Ad = util.global.load @"__auto.blk.25.ffn_down.weight:d" : tensor<f32> | |
%624 = torch_c.from_builtin_tensor %__auto.blk.25.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_norm.weight = util.global.load @__auto.blk.26.attn_norm.weight : tensor<4096xbf16> | |
%625 = torch_c.from_builtin_tensor %__auto.blk.26.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.26.attn_q.q_input3Arscale = util.global.load @"__auto.blk.26.attn_q.q_input:rscale" : tensor<f32> | |
%626 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_q.weight3Aqs = util.global.load @"__auto.blk.26.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%627 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_q.q_output3Arscale = util.global.load @"__auto.blk.26.attn_q.q_output:rscale" : tensor<f32> | |
%628 = torch_c.from_builtin_tensor %__auto.blk.26.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_k.q_input3Arscale = util.global.load @"__auto.blk.26.attn_k.q_input:rscale" : tensor<f32> | |
%629 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_k.weight3Aqs = util.global.load @"__auto.blk.26.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%630 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_k.q_output3Arscale = util.global.load @"__auto.blk.26.attn_k.q_output:rscale" : tensor<f32> | |
%631 = torch_c.from_builtin_tensor %__auto.blk.26.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_v.q_input3Arscale = util.global.load @"__auto.blk.26.attn_v.q_input:rscale" : tensor<f32> | |
%632 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_v.weight3Aqs = util.global.load @"__auto.blk.26.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%633 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_v.q_output3Arscale = util.global.load @"__auto.blk.26.attn_v.q_output:rscale" : tensor<f32> | |
%634 = torch_c.from_builtin_tensor %__auto.blk.26.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%635 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_output.q_input3Arscale = util.global.load @"__auto.blk.26.attn_output.q_input:rscale" : tensor<f32> | |
%636 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.attn_output.weight3Aqs = util.global.load @"__auto.blk.26.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%637 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.26.attn_output.weight3Ad = util.global.load @"__auto.blk.26.attn_output.weight:d" : tensor<f32> | |
%638 = torch_c.from_builtin_tensor %__auto.blk.26.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_norm.weight = util.global.load @__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> | |
%639 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.26.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_gate.q_input:rscale" : tensor<f32> | |
%640 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.26.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%641 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_gate.weight3Ad = util.global.load @"__auto.blk.26.ffn_gate.weight:d" : tensor<f32> | |
%642 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_up.q_input:rscale" : tensor<f32> | |
%643 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_up.weight3Aqs = util.global.load @"__auto.blk.26.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%644 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_up.weight3Ad = util.global.load @"__auto.blk.26.ffn_up.weight:d" : tensor<f32> | |
%645 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.26.ffn_down.q_input:rscale" : tensor<f32> | |
%646 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.26.ffn_down.weight3Aqs = util.global.load @"__auto.blk.26.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%647 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.26.ffn_down.weight3Ad = util.global.load @"__auto.blk.26.ffn_down.weight:d" : tensor<f32> | |
%648 = torch_c.from_builtin_tensor %__auto.blk.26.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_norm.weight = util.global.load @__auto.blk.27.attn_norm.weight : tensor<4096xbf16> | |
%649 = torch_c.from_builtin_tensor %__auto.blk.27.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.27.attn_q.q_input3Arscale = util.global.load @"__auto.blk.27.attn_q.q_input:rscale" : tensor<f32> | |
%650 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_q.weight3Aqs = util.global.load @"__auto.blk.27.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%651 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_q.q_output3Arscale = util.global.load @"__auto.blk.27.attn_q.q_output:rscale" : tensor<f32> | |
%652 = torch_c.from_builtin_tensor %__auto.blk.27.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_k.q_input3Arscale = util.global.load @"__auto.blk.27.attn_k.q_input:rscale" : tensor<f32> | |
%653 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_k.weight3Aqs = util.global.load @"__auto.blk.27.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%654 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_k.q_output3Arscale = util.global.load @"__auto.blk.27.attn_k.q_output:rscale" : tensor<f32> | |
%655 = torch_c.from_builtin_tensor %__auto.blk.27.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_v.q_input3Arscale = util.global.load @"__auto.blk.27.attn_v.q_input:rscale" : tensor<f32> | |
%656 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_v.weight3Aqs = util.global.load @"__auto.blk.27.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%657 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_v.q_output3Arscale = util.global.load @"__auto.blk.27.attn_v.q_output:rscale" : tensor<f32> | |
%658 = torch_c.from_builtin_tensor %__auto.blk.27.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%659 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_output.q_input3Arscale = util.global.load @"__auto.blk.27.attn_output.q_input:rscale" : tensor<f32> | |
%660 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.attn_output.weight3Aqs = util.global.load @"__auto.blk.27.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%661 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.27.attn_output.weight3Ad = util.global.load @"__auto.blk.27.attn_output.weight:d" : tensor<f32> | |
%662 = torch_c.from_builtin_tensor %__auto.blk.27.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_norm.weight = util.global.load @__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> | |
%663 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.27.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_gate.q_input:rscale" : tensor<f32> | |
%664 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.27.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%665 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_gate.weight3Ad = util.global.load @"__auto.blk.27.ffn_gate.weight:d" : tensor<f32> | |
%666 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_up.q_input:rscale" : tensor<f32> | |
%667 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_up.weight3Aqs = util.global.load @"__auto.blk.27.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%668 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_up.weight3Ad = util.global.load @"__auto.blk.27.ffn_up.weight:d" : tensor<f32> | |
%669 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.27.ffn_down.q_input:rscale" : tensor<f32> | |
%670 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.27.ffn_down.weight3Aqs = util.global.load @"__auto.blk.27.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%671 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.27.ffn_down.weight3Ad = util.global.load @"__auto.blk.27.ffn_down.weight:d" : tensor<f32> | |
%672 = torch_c.from_builtin_tensor %__auto.blk.27.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_norm.weight = util.global.load @__auto.blk.28.attn_norm.weight : tensor<4096xbf16> | |
%673 = torch_c.from_builtin_tensor %__auto.blk.28.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.28.attn_q.q_input3Arscale = util.global.load @"__auto.blk.28.attn_q.q_input:rscale" : tensor<f32> | |
%674 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_q.weight3Aqs = util.global.load @"__auto.blk.28.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%675 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_q.q_output3Arscale = util.global.load @"__auto.blk.28.attn_q.q_output:rscale" : tensor<f32> | |
%676 = torch_c.from_builtin_tensor %__auto.blk.28.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_k.q_input3Arscale = util.global.load @"__auto.blk.28.attn_k.q_input:rscale" : tensor<f32> | |
%677 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_k.weight3Aqs = util.global.load @"__auto.blk.28.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%678 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_k.q_output3Arscale = util.global.load @"__auto.blk.28.attn_k.q_output:rscale" : tensor<f32> | |
%679 = torch_c.from_builtin_tensor %__auto.blk.28.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_v.q_input3Arscale = util.global.load @"__auto.blk.28.attn_v.q_input:rscale" : tensor<f32> | |
%680 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_v.weight3Aqs = util.global.load @"__auto.blk.28.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%681 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_v.q_output3Arscale = util.global.load @"__auto.blk.28.attn_v.q_output:rscale" : tensor<f32> | |
%682 = torch_c.from_builtin_tensor %__auto.blk.28.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%683 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_output.q_input3Arscale = util.global.load @"__auto.blk.28.attn_output.q_input:rscale" : tensor<f32> | |
%684 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.attn_output.weight3Aqs = util.global.load @"__auto.blk.28.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%685 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.28.attn_output.weight3Ad = util.global.load @"__auto.blk.28.attn_output.weight:d" : tensor<f32> | |
%686 = torch_c.from_builtin_tensor %__auto.blk.28.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_norm.weight = util.global.load @__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> | |
%687 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.28.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_gate.q_input:rscale" : tensor<f32> | |
%688 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.28.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%689 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_gate.weight3Ad = util.global.load @"__auto.blk.28.ffn_gate.weight:d" : tensor<f32> | |
%690 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_up.q_input:rscale" : tensor<f32> | |
%691 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_up.weight3Aqs = util.global.load @"__auto.blk.28.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%692 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_up.weight3Ad = util.global.load @"__auto.blk.28.ffn_up.weight:d" : tensor<f32> | |
%693 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.28.ffn_down.q_input:rscale" : tensor<f32> | |
%694 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.28.ffn_down.weight3Aqs = util.global.load @"__auto.blk.28.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%695 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.28.ffn_down.weight3Ad = util.global.load @"__auto.blk.28.ffn_down.weight:d" : tensor<f32> | |
%696 = torch_c.from_builtin_tensor %__auto.blk.28.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_norm.weight = util.global.load @__auto.blk.29.attn_norm.weight : tensor<4096xbf16> | |
%697 = torch_c.from_builtin_tensor %__auto.blk.29.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.29.attn_q.q_input3Arscale = util.global.load @"__auto.blk.29.attn_q.q_input:rscale" : tensor<f32> | |
%698 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_q.weight3Aqs = util.global.load @"__auto.blk.29.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%699 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_q.q_output3Arscale = util.global.load @"__auto.blk.29.attn_q.q_output:rscale" : tensor<f32> | |
%700 = torch_c.from_builtin_tensor %__auto.blk.29.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_k.q_input3Arscale = util.global.load @"__auto.blk.29.attn_k.q_input:rscale" : tensor<f32> | |
%701 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_k.weight3Aqs = util.global.load @"__auto.blk.29.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%702 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_k.q_output3Arscale = util.global.load @"__auto.blk.29.attn_k.q_output:rscale" : tensor<f32> | |
%703 = torch_c.from_builtin_tensor %__auto.blk.29.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_v.q_input3Arscale = util.global.load @"__auto.blk.29.attn_v.q_input:rscale" : tensor<f32> | |
%704 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_v.weight3Aqs = util.global.load @"__auto.blk.29.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%705 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_v.q_output3Arscale = util.global.load @"__auto.blk.29.attn_v.q_output:rscale" : tensor<f32> | |
%706 = torch_c.from_builtin_tensor %__auto.blk.29.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%707 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_output.q_input3Arscale = util.global.load @"__auto.blk.29.attn_output.q_input:rscale" : tensor<f32> | |
%708 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.attn_output.weight3Aqs = util.global.load @"__auto.blk.29.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%709 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.29.attn_output.weight3Ad = util.global.load @"__auto.blk.29.attn_output.weight:d" : tensor<f32> | |
%710 = torch_c.from_builtin_tensor %__auto.blk.29.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_norm.weight = util.global.load @__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> | |
%711 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.29.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_gate.q_input:rscale" : tensor<f32> | |
%712 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.29.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%713 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_gate.weight3Ad = util.global.load @"__auto.blk.29.ffn_gate.weight:d" : tensor<f32> | |
%714 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_up.q_input:rscale" : tensor<f32> | |
%715 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_up.weight3Aqs = util.global.load @"__auto.blk.29.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%716 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_up.weight3Ad = util.global.load @"__auto.blk.29.ffn_up.weight:d" : tensor<f32> | |
%717 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.29.ffn_down.q_input:rscale" : tensor<f32> | |
%718 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.29.ffn_down.weight3Aqs = util.global.load @"__auto.blk.29.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%719 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.29.ffn_down.weight3Ad = util.global.load @"__auto.blk.29.ffn_down.weight:d" : tensor<f32> | |
%720 = torch_c.from_builtin_tensor %__auto.blk.29.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_norm.weight = util.global.load @__auto.blk.30.attn_norm.weight : tensor<4096xbf16> | |
%721 = torch_c.from_builtin_tensor %__auto.blk.30.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.30.attn_q.q_input3Arscale = util.global.load @"__auto.blk.30.attn_q.q_input:rscale" : tensor<f32> | |
%722 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_q.weight3Aqs = util.global.load @"__auto.blk.30.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%723 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_q.q_output3Arscale = util.global.load @"__auto.blk.30.attn_q.q_output:rscale" : tensor<f32> | |
%724 = torch_c.from_builtin_tensor %__auto.blk.30.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_k.q_input3Arscale = util.global.load @"__auto.blk.30.attn_k.q_input:rscale" : tensor<f32> | |
%725 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_k.weight3Aqs = util.global.load @"__auto.blk.30.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%726 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_k.q_output3Arscale = util.global.load @"__auto.blk.30.attn_k.q_output:rscale" : tensor<f32> | |
%727 = torch_c.from_builtin_tensor %__auto.blk.30.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_v.q_input3Arscale = util.global.load @"__auto.blk.30.attn_v.q_input:rscale" : tensor<f32> | |
%728 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_v.weight3Aqs = util.global.load @"__auto.blk.30.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%729 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_v.q_output3Arscale = util.global.load @"__auto.blk.30.attn_v.q_output:rscale" : tensor<f32> | |
%730 = torch_c.from_builtin_tensor %__auto.blk.30.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%731 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_output.q_input3Arscale = util.global.load @"__auto.blk.30.attn_output.q_input:rscale" : tensor<f32> | |
%732 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.attn_output.weight3Aqs = util.global.load @"__auto.blk.30.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%733 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.30.attn_output.weight3Ad = util.global.load @"__auto.blk.30.attn_output.weight:d" : tensor<f32> | |
%734 = torch_c.from_builtin_tensor %__auto.blk.30.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_norm.weight = util.global.load @__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> | |
%735 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.30.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_gate.q_input:rscale" : tensor<f32> | |
%736 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.30.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%737 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_gate.weight3Ad = util.global.load @"__auto.blk.30.ffn_gate.weight:d" : tensor<f32> | |
%738 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_up.q_input:rscale" : tensor<f32> | |
%739 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_up.weight3Aqs = util.global.load @"__auto.blk.30.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%740 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_up.weight3Ad = util.global.load @"__auto.blk.30.ffn_up.weight:d" : tensor<f32> | |
%741 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.30.ffn_down.q_input:rscale" : tensor<f32> | |
%742 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.30.ffn_down.weight3Aqs = util.global.load @"__auto.blk.30.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%743 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.30.ffn_down.weight3Ad = util.global.load @"__auto.blk.30.ffn_down.weight:d" : tensor<f32> | |
%744 = torch_c.from_builtin_tensor %__auto.blk.30.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
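// blk.31 below appears to be the last decoder block whose parameters are loaded in this prologue;
// with blocks numbered from 0 this would be a 32-layer model, consistent with the 4096 hidden size
// and 14336 FFN width seen in the shapes above (an assumption inferred from the IR, not stated in it).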
%__auto.blk.31.attn_norm.weight = util.global.load @__auto.blk.31.attn_norm.weight : tensor<4096xbf16> | |
%745 = torch_c.from_builtin_tensor %__auto.blk.31.attn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.31.attn_q.q_input3Arscale = util.global.load @"__auto.blk.31.attn_q.q_input:rscale" : tensor<f32> | |
%746 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_q.weight3Aqs = util.global.load @"__auto.blk.31.attn_q.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%747 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_q.q_output3Arscale = util.global.load @"__auto.blk.31.attn_q.q_output:rscale" : tensor<f32> | |
%748 = torch_c.from_builtin_tensor %__auto.blk.31.attn_q.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_k.q_input3Arscale = util.global.load @"__auto.blk.31.attn_k.q_input:rscale" : tensor<f32> | |
%749 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_k.weight3Aqs = util.global.load @"__auto.blk.31.attn_k.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%750 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_k.q_output3Arscale = util.global.load @"__auto.blk.31.attn_k.q_output:rscale" : tensor<f32> | |
%751 = torch_c.from_builtin_tensor %__auto.blk.31.attn_k.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_v.q_input3Arscale = util.global.load @"__auto.blk.31.attn_v.q_input:rscale" : tensor<f32> | |
%752 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_v.weight3Aqs = util.global.load @"__auto.blk.31.attn_v.weight:qs" : tensor<1024x4096xf8E4M3FNUZ> | |
%753 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.weight3Aqs : tensor<1024x4096xf8E4M3FNUZ> -> !torch.vtensor<[1024,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_v.q_output3Arscale = util.global.load @"__auto.blk.31.attn_v.q_output:rscale" : tensor<f32> | |
%754 = torch_c.from_builtin_tensor %__auto.blk.31.attn_v.q_output3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%755 = torch.vtensor.literal(dense<0.0883883461> : tensor<f32>) : !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_output.q_input3Arscale = util.global.load @"__auto.blk.31.attn_output.q_input:rscale" : tensor<f32> | |
%756 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.attn_output.weight3Aqs = util.global.load @"__auto.blk.31.attn_output.weight:qs" : tensor<4096x4096xf8E4M3FNUZ> | |
%757 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Aqs : tensor<4096x4096xf8E4M3FNUZ> -> !torch.vtensor<[4096,4096],f8E4M3FNUZ> | |
%__auto.blk.31.attn_output.weight3Ad = util.global.load @"__auto.blk.31.attn_output.weight:d" : tensor<f32> | |
%758 = torch_c.from_builtin_tensor %__auto.blk.31.attn_output.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_norm.weight = util.global.load @__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> | |
%759 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.blk.31.ffn_gate.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_gate.q_input:rscale" : tensor<f32> | |
%760 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_gate.weight3Aqs = util.global.load @"__auto.blk.31.ffn_gate.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%761 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_gate.weight3Ad = util.global.load @"__auto.blk.31.ffn_gate.weight:d" : tensor<f32> | |
%762 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_gate.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_up.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_up.q_input:rscale" : tensor<f32> | |
%763 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_up.weight3Aqs = util.global.load @"__auto.blk.31.ffn_up.weight:qs" : tensor<14336x4096xf8E4M3FNUZ> | |
%764 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Aqs : tensor<14336x4096xf8E4M3FNUZ> -> !torch.vtensor<[14336,4096],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_up.weight3Ad = util.global.load @"__auto.blk.31.ffn_up.weight:d" : tensor<f32> | |
%765 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_up.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_down.q_input3Arscale = util.global.load @"__auto.blk.31.ffn_down.q_input:rscale" : tensor<f32> | |
%766 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.q_input3Arscale : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.blk.31.ffn_down.weight3Aqs = util.global.load @"__auto.blk.31.ffn_down.weight:qs" : tensor<4096x14336xf8E4M3FNUZ> | |
%767 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Aqs : tensor<4096x14336xf8E4M3FNUZ> -> !torch.vtensor<[4096,14336],f8E4M3FNUZ> | |
%__auto.blk.31.ffn_down.weight3Ad = util.global.load @"__auto.blk.31.ffn_down.weight:d" : tensor<f32> | |
%768 = torch_c.from_builtin_tensor %__auto.blk.31.ffn_down.weight3Ad : tensor<f32> -> !torch.vtensor<[],f32> | |
%__auto.output_norm.weight = util.global.load @__auto.output_norm.weight : tensor<4096xbf16> | |
%769 = torch_c.from_builtin_tensor %__auto.output_norm.weight : tensor<4096xbf16> -> !torch.vtensor<[4096],bf16> | |
%__auto.output.weight = util.global.load @__auto.output.weight : tensor<128256x4096xbf16> | |
%770 = torch_c.from_builtin_tensor %__auto.output.weight : tensor<128256x4096xbf16> -> !torch.vtensor<[128256,4096],bf16> | |
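// End of parameter loads; the prefill body begins below. The arguments appear to be:
// %arg0 = token ids [4, seq_len], %arg1 = per-sequence lengths [4],
// %arg2 = KV-cache page ids [4, ?], %arg3 = flat paged KV cache [?, 2097152] in f8E4M3FNUZ.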
%771 = torch.copy.to_vtensor %arg3 : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
%772 = torch.symbolic_int "32*s1" {min_val = 64, max_val = 131040} : !torch.int | |
%773 = torch.symbolic_int "s1" {min_val = 2, max_val = 4095} : !torch.int | |
%774 = torch.symbolic_int "s2" {min_val = 0, max_val = 9223372036854775807} : !torch.int | |
torch.bind_symbolic_shape %arg0, [%773], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %arg2, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %771, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
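// Symbolic dims: s1 is the sequence length in 32-token blocks (seq_len = 32 * s1, at most 131040)
// and s2 is the number of rows (pages) in the KV cache. The arange below builds position ids [0, seq_len).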
%int1 = torch.constant.int 1 | |
%775 = torch.aten.size.int %arg2, %int1 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int | |
%int0 = torch.constant.int 0 | |
%776 = torch.aten.size.int %771, %int0 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.int -> !torch.int | |
%int1_0 = torch.constant.int 1 | |
%777 = torch.aten.size.int %arg0, %int1_0 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.int | |
%int0_1 = torch.constant.int 0 | |
%int1_2 = torch.constant.int 1 | |
%none = torch.constant.none | |
%none_3 = torch.constant.none | |
%cpu = torch.constant.device "cpu" | |
%false = torch.constant.bool false | |
%778 = torch.aten.arange.start_step %int0_1, %777, %int1_2, %none, %none_3, %cpu, %false : !torch.int, !torch.int, !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %778, [%773], affine_map<()[s0] -> (s0 * 32)> : !torch.vtensor<[?],si64> | |
%int-1 = torch.constant.int -1 | |
%779 = torch.aten.unsqueeze %arg1, %int-1 : !torch.vtensor<[4],si64>, !torch.int -> !torch.vtensor<[4,1],si64> | |
%780 = torch.aten.ge.Tensor %778, %779 : !torch.vtensor<[?],si64>, !torch.vtensor<[4,1],si64> -> !torch.vtensor<[4,?],i1> | |
torch.bind_symbolic_shape %780, [%773], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],i1> | |
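// %780 is the padding mask (true where position >= %arg1[b]). Next, a [1,1] ones tensor is expanded
// to the full 131072 x 131072 context and triu(diagonal = 1) produces the causal mask, which the
// slices below trim to [1, 1, seq_len, seq_len].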
%int1_4 = torch.constant.int 1 | |
%int1_5 = torch.constant.int 1 | |
%781 = torch.prim.ListConstruct %int1_4, %int1_5 : (!torch.int, !torch.int) -> !torch.list<int> | |
%int11 = torch.constant.int 11 | |
%none_6 = torch.constant.none | |
%cpu_7 = torch.constant.device "cpu" | |
%false_8 = torch.constant.bool false | |
%782 = torch.aten.ones %781, %int11, %none_6, %cpu_7, %false_8 : !torch.list<int>, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[1,1],i1> | |
%int131072 = torch.constant.int 131072 | |
%int131072_9 = torch.constant.int 131072 | |
%783 = torch.prim.ListConstruct %int131072, %int131072_9 : (!torch.int, !torch.int) -> !torch.list<int> | |
%false_10 = torch.constant.bool false | |
%784 = torch.aten.expand %782, %783, %false_10 : !torch.vtensor<[1,1],i1>, !torch.list<int>, !torch.bool -> !torch.vtensor<[131072,131072],i1> | |
%int1_11 = torch.constant.int 1 | |
%785 = torch.aten.triu %784, %int1_11 : !torch.vtensor<[131072,131072],i1>, !torch.int -> !torch.vtensor<[131072,131072],i1> | |
%int0_12 = torch.constant.int 0 | |
%786 = torch.aten.unsqueeze %785, %int0_12 : !torch.vtensor<[131072,131072],i1>, !torch.int -> !torch.vtensor<[1,131072,131072],i1> | |
%int1_13 = torch.constant.int 1 | |
%787 = torch.aten.unsqueeze %786, %int1_13 : !torch.vtensor<[1,131072,131072],i1>, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int2 = torch.constant.int 2 | |
%int0_14 = torch.constant.int 0 | |
%int9223372036854775807 = torch.constant.int 9223372036854775807 | |
%int1_15 = torch.constant.int 1 | |
%788 = torch.aten.slice.Tensor %787, %int2, %int0_14, %int9223372036854775807, %int1_15 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int3 = torch.constant.int 3 | |
%int0_16 = torch.constant.int 0 | |
%int9223372036854775807_17 = torch.constant.int 9223372036854775807 | |
%int1_18 = torch.constant.int 1 | |
%789 = torch.aten.slice.Tensor %788, %int3, %int0_16, %int9223372036854775807_17, %int1_18 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int0_19 = torch.constant.int 0 | |
%int0_20 = torch.constant.int 0 | |
%int9223372036854775807_21 = torch.constant.int 9223372036854775807 | |
%int1_22 = torch.constant.int 1 | |
%790 = torch.aten.slice.Tensor %789, %int0_19, %int0_20, %int9223372036854775807_21, %int1_22 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int1_23 = torch.constant.int 1 | |
%int0_24 = torch.constant.int 0 | |
%int9223372036854775807_25 = torch.constant.int 9223372036854775807 | |
%int1_26 = torch.constant.int 1 | |
%791 = torch.aten.slice.Tensor %790, %int1_23, %int0_24, %int9223372036854775807_25, %int1_26 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,131072,131072],i1> | |
%int2_27 = torch.constant.int 2 | |
%int0_28 = torch.constant.int 0 | |
%int1_29 = torch.constant.int 1 | |
%792 = torch.aten.slice.Tensor %791, %int2_27, %int0_28, %777, %int1_29 : !torch.vtensor<[1,1,131072,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,131072],i1> | |
torch.bind_symbolic_shape %792, [%773], affine_map<()[s0] -> (1, 1, s0 * 32, 131072)> : !torch.vtensor<[1,1,?,131072],i1> | |
%int3_30 = torch.constant.int 3 | |
%int0_31 = torch.constant.int 0 | |
%int1_32 = torch.constant.int 1 | |
%793 = torch.aten.slice.Tensor %792, %int3_30, %int0_31, %777, %int1_32 : !torch.vtensor<[1,1,?,131072],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,?,?],i1> | |
torch.bind_symbolic_shape %793, [%773], affine_map<()[s0] -> (1, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,1,?,?],i1> | |
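// Broadcast the padding mask to [4, 1, 1, seq_len], OR it with the causal mask, and turn the combined
// boolean mask into an additive attention bias: -inf where masked, 0 elsewhere (f32).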
%int0_33 = torch.constant.int 0 | |
%int0_34 = torch.constant.int 0 | |
%int9223372036854775807_35 = torch.constant.int 9223372036854775807 | |
%int1_36 = torch.constant.int 1 | |
%794 = torch.aten.slice.Tensor %780, %int0_33, %int0_34, %int9223372036854775807_35, %int1_36 : !torch.vtensor<[4,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?],i1> | |
torch.bind_symbolic_shape %794, [%773], affine_map<()[s0] -> (4, s0 * 32)> : !torch.vtensor<[4,?],i1> | |
%int1_37 = torch.constant.int 1 | |
%795 = torch.aten.unsqueeze %794, %int1_37 : !torch.vtensor<[4,?],i1>, !torch.int -> !torch.vtensor<[4,1,?],i1> | |
torch.bind_symbolic_shape %795, [%773], affine_map<()[s0] -> (4, 1, s0 * 32)> : !torch.vtensor<[4,1,?],i1> | |
%int2_38 = torch.constant.int 2 | |
%796 = torch.aten.unsqueeze %795, %int2_38 : !torch.vtensor<[4,1,?],i1>, !torch.int -> !torch.vtensor<[4,1,1,?],i1> | |
torch.bind_symbolic_shape %796, [%773], affine_map<()[s0] -> (4, 1, 1, s0 * 32)> : !torch.vtensor<[4,1,1,?],i1> | |
%int3_39 = torch.constant.int 3 | |
%int0_40 = torch.constant.int 0 | |
%int9223372036854775807_41 = torch.constant.int 9223372036854775807 | |
%int1_42 = torch.constant.int 1 | |
%797 = torch.aten.slice.Tensor %796, %int3_39, %int0_40, %int9223372036854775807_41, %int1_42 : !torch.vtensor<[4,1,1,?],i1>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,1,1,?],i1> | |
torch.bind_symbolic_shape %797, [%773], affine_map<()[s0] -> (4, 1, 1, s0 * 32)> : !torch.vtensor<[4,1,1,?],i1> | |
%798 = torch.aten.logical_or %793, %797 : !torch.vtensor<[1,1,?,?],i1>, !torch.vtensor<[4,1,1,?],i1> -> !torch.vtensor<[4,1,?,?],i1> | |
torch.bind_symbolic_shape %798, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],i1> | |
%int0_43 = torch.constant.int 0 | |
%int6 = torch.constant.int 6 | |
%int0_44 = torch.constant.int 0 | |
%cpu_45 = torch.constant.device "cpu" | |
%none_46 = torch.constant.none | |
%799 = torch.aten.scalar_tensor %int0_43, %int6, %int0_44, %cpu_45, %none_46 : !torch.int, !torch.int, !torch.int, !torch.Device, !torch.none -> !torch.vtensor<[],f32> | |
%float-Inf = torch.constant.float 0xFFF0000000000000 | |
%int6_47 = torch.constant.int 6 | |
%int0_48 = torch.constant.int 0 | |
%cpu_49 = torch.constant.device "cpu" | |
%none_50 = torch.constant.none | |
%800 = torch.aten.scalar_tensor %float-Inf, %int6_47, %int0_48, %cpu_49, %none_50 : !torch.float, !torch.int, !torch.int, !torch.Device, !torch.none -> !torch.vtensor<[],f32> | |
%801 = torch.aten.where.self %798, %800, %799 : !torch.vtensor<[4,1,?,?],i1>, !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,1,?,?],f32> | |
torch.bind_symbolic_shape %801, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f32> | |
%int6_51 = torch.constant.int 6 | |
%802 = torch.prims.convert_element_type %801, %int6_51 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f32> | |
torch.bind_symbolic_shape %802, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f32> | |
%int6_52 = torch.constant.int 6 | |
%803 = torch.prims.convert_element_type %802, %int6_52 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f32> | |
torch.bind_symbolic_shape %803, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f32> | |
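// Token embedding lookup: token_embd.weight (bf16) gathered by %arg0 -> [4, seq_len, 4096] bf16.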
%int15 = torch.constant.int 15 | |
%804 = torch.prims.convert_element_type %0, %int15 : !torch.vtensor<[128256,4096],bf16>, !torch.int -> !torch.vtensor<[128256,4096],bf16> | |
%int-1_53 = torch.constant.int -1 | |
%false_54 = torch.constant.bool false | |
%false_55 = torch.constant.bool false | |
%805 = torch.aten.embedding %804, %arg0, %int-1_53, %false_54, %false_55 : !torch.vtensor<[128256,4096],bf16>, !torch.vtensor<[4,?],si64>, !torch.int, !torch.bool, !torch.bool -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %805, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
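// RMSNorm (blk.0.attn_norm) in f32: mean of squares over the hidden dim, add eps 1e-05, rsqrt,
// scale the activations, cast back to bf16 and multiply by the norm weight.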
%int6_56 = torch.constant.int 6 | |
%806 = torch.prims.convert_element_type %805, %int6_56 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %806, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_57 = torch.constant.int 2 | |
%807 = torch.aten.pow.Tensor_Scalar %806, %int2_57 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %807, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_58 = torch.constant.int -1 | |
%808 = torch.prim.ListConstruct %int-1_58 : (!torch.int) -> !torch.list<int> | |
%true = torch.constant.bool true | |
%none_59 = torch.constant.none | |
%809 = torch.aten.mean.dim %807, %808, %true, %none_59 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %809, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05 = torch.constant.float 1.000000e-05 | |
%int1_60 = torch.constant.int 1 | |
%810 = torch.aten.add.Scalar %809, %float1.000000e-05, %int1_60 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %810, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%811 = torch.aten.rsqrt %810 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %811, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%812 = torch.aten.mul.Tensor %806, %811 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %812, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int15_61 = torch.constant.int 15 | |
%813 = torch.prims.convert_element_type %812, %int15_61 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %813, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%814 = torch.aten.mul.Tensor %1, %813 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],bf16> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %814, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int15_62 = torch.constant.int 15 | |
%815 = torch.prims.convert_element_type %814, %int15_62 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %815, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
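// Q/K/V projections for blk.0. Each input is divided by its q_input:rscale, clamped to [-240, 240]
// (the finite range of f8E4M3FNUZ), cast to f8, and fed to a batched matmul-transpose-B kernel
// against the f8 weights; the f32 result is rescaled by q_output:rscale, clamped and requantized to f8.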
%816 = torch.aten.div.Tensor %815, %2 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %816, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%float-2.400000e02 = torch.constant.float -2.400000e+02 | |
%float2.400000e02 = torch.constant.float 2.400000e+02 | |
%817 = torch.aten.clamp %816, %float-2.400000e02, %float2.400000e02 : !torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %817, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int26 = torch.constant.int 26 | |
%818 = torch.prims.convert_element_type %817, %int26 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %818, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_63 = torch.constant.int 0 | |
%819 = torch.aten.unsqueeze %3, %int0_63 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4 = torch.constant.int 4 | |
%int4096 = torch.constant.int 4096 | |
%int4096_64 = torch.constant.int 4096 | |
%820 = torch.prim.ListConstruct %int4, %int4096, %int4096_64 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_65 = torch.constant.bool false | |
%821 = torch.aten.expand %819, %820, %false_65 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%822 = torch_c.to_builtin_tensor %818 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%823 = torch_c.to_builtin_tensor %821 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%824 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%822, %823) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%825 = torch_c.from_builtin_tensor %824 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %825, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%826 = torch.aten.div.Tensor %825, %4 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %826, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_66 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_67 = torch.constant.float 2.400000e+02 | |
%827 = torch.aten.clamp %826, %float-2.400000e02_66, %float2.400000e02_67 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %827, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_68 = torch.constant.int 26 | |
%828 = torch.prims.convert_element_type %827, %int26_68 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %828, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%829 = torch.aten.div.Tensor %815, %5 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %829, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%float-2.400000e02_69 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_70 = torch.constant.float 2.400000e+02 | |
%830 = torch.aten.clamp %829, %float-2.400000e02_69, %float2.400000e02_70 : !torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %830, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int26_71 = torch.constant.int 26 | |
%831 = torch.prims.convert_element_type %830, %int26_71 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %831, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_72 = torch.constant.int 0 | |
%832 = torch.aten.unsqueeze %6, %int0_72 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_73 = torch.constant.int 4 | |
%int1024 = torch.constant.int 1024 | |
%int4096_74 = torch.constant.int 4096 | |
%833 = torch.prim.ListConstruct %int4_73, %int1024, %int4096_74 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_75 = torch.constant.bool false | |
%834 = torch.aten.expand %832, %833, %false_75 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%835 = torch_c.to_builtin_tensor %831 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%836 = torch_c.to_builtin_tensor %834 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%837 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%835, %836) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%838 = torch_c.from_builtin_tensor %837 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %838, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%839 = torch.aten.div.Tensor %838, %7 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %839, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_76 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_77 = torch.constant.float 2.400000e+02 | |
%840 = torch.aten.clamp %839, %float-2.400000e02_76, %float2.400000e02_77 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %840, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_78 = torch.constant.int 26 | |
%841 = torch.prims.convert_element_type %840, %int26_78 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %841, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%842 = torch.aten.div.Tensor %815, %8 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %842, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%float-2.400000e02_79 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_80 = torch.constant.float 2.400000e+02 | |
%843 = torch.aten.clamp %842, %float-2.400000e02_79, %float2.400000e02_80 : !torch.vtensor<[4,?,4096],bf16>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],bf16> | |
torch.bind_symbolic_shape %843, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],bf16> | |
%int26_81 = torch.constant.int 26 | |
%844 = torch.prims.convert_element_type %843, %int26_81 : !torch.vtensor<[4,?,4096],bf16>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %844, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_82 = torch.constant.int 0 | |
%845 = torch.aten.unsqueeze %9, %int0_82 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_83 = torch.constant.int 4 | |
%int1024_84 = torch.constant.int 1024 | |
%int4096_85 = torch.constant.int 4096 | |
%846 = torch.prim.ListConstruct %int4_83, %int1024_84, %int4096_85 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_86 = torch.constant.bool false | |
%847 = torch.aten.expand %845, %846, %false_86 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%848 = torch_c.to_builtin_tensor %844 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%849 = torch_c.to_builtin_tensor %847 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%850 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%848, %849) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%851 = torch_c.from_builtin_tensor %850 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %851, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%852 = torch.aten.div.Tensor %851, %10 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %852, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_87 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_88 = torch.constant.float 2.400000e+02 | |
%853 = torch.aten.clamp %852, %float-2.400000e02_87, %float2.400000e02_88 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %853, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_89 = torch.constant.int 26 | |
%854 = torch.prims.convert_element_type %853, %int26_89 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %854, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
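// Reshape the projections into heads: Q -> [4, seq_len, 32, 128], K and V -> [4, seq_len, 8, 128]
// (32 query heads, 8 KV heads, head_dim 128).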
%int4_90 = torch.constant.int 4 | |
%int32 = torch.constant.int 32 | |
%int128 = torch.constant.int 128 | |
%855 = torch.prim.ListConstruct %int4_90, %777, %int32, %int128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%856 = torch.aten.view %828, %855 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %856, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_91 = torch.constant.int 4 | |
%int8 = torch.constant.int 8 | |
%int128_92 = torch.constant.int 128 | |
%857 = torch.prim.ListConstruct %int4_91, %777, %int8, %int128_92 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%858 = torch.aten.view %841, %857 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %858, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_93 = torch.constant.int 4 | |
%int8_94 = torch.constant.int 8 | |
%int128_95 = torch.constant.int 128 | |
%859 = torch.prim.ListConstruct %int4_93, %777, %int8_94, %int128_95 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%860 = torch.aten.view %854, %859 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %860, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
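// Rotary embedding table: inverse frequencies with base 5.0e+05 over a 128-dim head, followed by
// what appears to be Llama-3.1-style frequency scaling (scale factor 8, low/high wavelength
// thresholds 2048 / 8192), then cos/sin tables for positions [0, 131072) cast to bf16.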
%int131072_96 = torch.constant.int 131072 | |
%none_97 = torch.constant.none | |
%none_98 = torch.constant.none | |
%cpu_99 = torch.constant.device "cpu" | |
%false_100 = torch.constant.bool false | |
%861 = torch.aten.arange %int131072_96, %none_97, %none_98, %cpu_99, %false_100 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_101 = torch.constant.int 0 | |
%int128_102 = torch.constant.int 128 | |
%int2_103 = torch.constant.int 2 | |
%int4_104 = torch.constant.int 4 | |
%none_105 = torch.constant.none | |
%cpu_106 = torch.constant.device "cpu" | |
%false_107 = torch.constant.bool false | |
%862 = torch.aten.arange.start_step %int0_101, %int128_102, %int2_103, %int4_104, %none_105, %cpu_106, %false_107 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_108 = torch.constant.int 6 | |
%863 = torch.prims.convert_element_type %862, %int6_108 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_109 = torch.constant.int 128 | |
%864 = torch.aten.div.Scalar %863, %int128_109 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05 = torch.constant.float 5.000000e+05 | |
%865 = torch.aten.pow.Scalar %float5.000000e05, %864 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%866 = torch.aten.reciprocal %865 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00 = torch.constant.float 1.000000e+00 | |
%867 = torch.aten.mul.Scalar %866, %float1.000000e00 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%868 = torch.aten.reciprocal %867 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00 = torch.constant.float 6.2831853071795862 | |
%869 = torch.aten.mul.Scalar %868, %float6.283190e00 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03 = torch.constant.float 8.192000e+03 | |
%870 = torch.aten.gt.Scalar %869, %float8.192000e03 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_110 = torch.constant.int 8 | |
%871 = torch.aten.div.Scalar %867, %int8_110 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%872 = torch.aten.where.self %870, %871, %867 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%873 = torch.aten.reciprocal %869 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192 = torch.constant.int 8192 | |
%874 = torch.aten.mul.Scalar %873, %int8192 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_111 = torch.constant.int 1 | |
%int1_112 = torch.constant.int 1 | |
%875 = torch.aten.sub.Scalar %874, %int1_111, %int1_112 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_113 = torch.constant.int 3 | |
%876 = torch.aten.div.Scalar %875, %int3_113 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_114 = torch.constant.int 1 | |
%int1_115 = torch.constant.int 1 | |
%877 = torch.aten.rsub.Scalar %876, %int1_114, %int1_115 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%878 = torch.aten.mul.Tensor %877, %872 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_116 = torch.constant.int 8 | |
%879 = torch.aten.div.Scalar %878, %int8_116 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%880 = torch.aten.mul.Tensor %876, %872 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_117 = torch.constant.int 1 | |
%881 = torch.aten.add.Tensor %879, %880, %int1_117 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03 = torch.constant.float 2.048000e+03 | |
%882 = torch.aten.lt.Scalar %869, %float2.048000e03 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%883 = torch.aten.bitwise_not %882 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_118 = torch.constant.float 8.192000e+03 | |
%884 = torch.aten.gt.Scalar %869, %float8.192000e03_118 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%885 = torch.aten.bitwise_not %884 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%886 = torch.aten.mul.Tensor %883, %885 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%887 = torch.aten.where.self %886, %881, %872 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%888 = torch.prim.ListConstruct %887, %887 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_119 = torch.constant.int -1 | |
%889 = torch.aten.cat %888, %int-1_119 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_120 = torch.constant.int 6 | |
%890 = torch.prims.convert_element_type %889, %int6_120 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_121 = torch.constant.int 1 | |
%891 = torch.aten.unsqueeze %861, %int1_121 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_122 = torch.constant.int 6 | |
%892 = torch.prims.convert_element_type %891, %int6_122 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_123 = torch.constant.int 0 | |
%893 = torch.aten.unsqueeze %890, %int0_123 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_124 = torch.constant.int 6 | |
%894 = torch.prims.convert_element_type %893, %int6_124 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%895 = torch.aten.mul.Tensor %892, %894 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%896 = torch.aten.cos %895 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_125 = torch.constant.int 15 | |
%897 = torch.prims.convert_element_type %896, %int15_125 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%898 = torch.aten.sin %895 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_126 = torch.constant.int 15 | |
%899 = torch.prims.convert_element_type %898, %int15_126 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
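// Apply RoPE to Q: slice the cos/sin tables to the current positions, broadcast them to
// [4, seq_len, 1, 128], and combine q*cos + rotate_half(q)*sin (the neg/cat pair below builds rotate_half).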
%int0_127 = torch.constant.int 0 | |
%int0_128 = torch.constant.int 0 | |
%int1_129 = torch.constant.int 1 | |
%900 = torch.aten.slice.Tensor %897, %int0_127, %int0_128, %777, %int1_129 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %900, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_130 = torch.constant.int 1 | |
%int0_131 = torch.constant.int 0 | |
%int9223372036854775807_132 = torch.constant.int 9223372036854775807 | |
%int1_133 = torch.constant.int 1 | |
%901 = torch.aten.slice.Tensor %900, %int1_130, %int0_131, %int9223372036854775807_132, %int1_133 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %901, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_134 = torch.constant.int 0 | |
%int0_135 = torch.constant.int 0 | |
%int1_136 = torch.constant.int 1 | |
%902 = torch.aten.slice.Tensor %899, %int0_134, %int0_135, %777, %int1_136 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %902, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_137 = torch.constant.int 1 | |
%int0_138 = torch.constant.int 0 | |
%int9223372036854775807_139 = torch.constant.int 9223372036854775807 | |
%int1_140 = torch.constant.int 1 | |
%903 = torch.aten.slice.Tensor %902, %int1_137, %int0_138, %int9223372036854775807_139, %int1_140 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %903, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_141 = torch.constant.int 0 | |
%904 = torch.aten.unsqueeze %901, %int0_141 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %904, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_142 = torch.constant.int 1 | |
%int0_143 = torch.constant.int 0 | |
%int9223372036854775807_144 = torch.constant.int 9223372036854775807 | |
%int1_145 = torch.constant.int 1 | |
%905 = torch.aten.slice.Tensor %904, %int1_142, %int0_143, %int9223372036854775807_144, %int1_145 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %905, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_146 = torch.constant.int 2 | |
%906 = torch.aten.unsqueeze %905, %int2_146 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %906, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_147 = torch.constant.int 3 | |
%int0_148 = torch.constant.int 0 | |
%int9223372036854775807_149 = torch.constant.int 9223372036854775807 | |
%int1_150 = torch.constant.int 1 | |
%907 = torch.aten.slice.Tensor %906, %int3_147, %int0_148, %int9223372036854775807_149, %int1_150 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %907, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_151 = torch.constant.int 4 | |
%int1_152 = torch.constant.int 1 | |
%int1_153 = torch.constant.int 1 | |
%int1_154 = torch.constant.int 1 | |
%908 = torch.prim.ListConstruct %int4_151, %int1_152, %int1_153, %int1_154 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%909 = torch.aten.repeat %907, %908 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %909, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_155 = torch.constant.int 0 | |
%910 = torch.aten.unsqueeze %903, %int0_155 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %910, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_156 = torch.constant.int 1 | |
%int0_157 = torch.constant.int 0 | |
%int9223372036854775807_158 = torch.constant.int 9223372036854775807 | |
%int1_159 = torch.constant.int 1 | |
%911 = torch.aten.slice.Tensor %910, %int1_156, %int0_157, %int9223372036854775807_158, %int1_159 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %911, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_160 = torch.constant.int 2 | |
%912 = torch.aten.unsqueeze %911, %int2_160 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %912, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_161 = torch.constant.int 3 | |
%int0_162 = torch.constant.int 0 | |
%int9223372036854775807_163 = torch.constant.int 9223372036854775807 | |
%int1_164 = torch.constant.int 1 | |
%913 = torch.aten.slice.Tensor %912, %int3_161, %int0_162, %int9223372036854775807_163, %int1_164 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %913, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_165 = torch.constant.int 4 | |
%int1_166 = torch.constant.int 1 | |
%int1_167 = torch.constant.int 1 | |
%int1_168 = torch.constant.int 1 | |
%914 = torch.prim.ListConstruct %int4_165, %int1_166, %int1_167, %int1_168 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%915 = torch.aten.repeat %913, %914 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %915, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%916 = torch.aten.mul.Tensor %856, %909 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %916, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_169 = torch.constant.int 3 | |
%int0_170 = torch.constant.int 0 | |
%int64 = torch.constant.int 64 | |
%int1_171 = torch.constant.int 1 | |
%917 = torch.aten.slice.Tensor %856, %int3_169, %int0_170, %int64, %int1_171 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %917, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_172 = torch.constant.int 3 | |
%int64_173 = torch.constant.int 64 | |
%int9223372036854775807_174 = torch.constant.int 9223372036854775807 | |
%int1_175 = torch.constant.int 1 | |
%918 = torch.aten.slice.Tensor %856, %int3_172, %int64_173, %int9223372036854775807_174, %int1_175 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %918, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%919 = torch.aten.neg %918 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %919, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%920 = torch.prim.ListConstruct %919, %917 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_176 = torch.constant.int -1 | |
%921 = torch.aten.cat %920, %int-1_176 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %921, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%922 = torch.aten.mul.Tensor %921, %915 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %922, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_177 = torch.constant.int 1 | |
%923 = torch.aten.add.Tensor %916, %922, %int1_177 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %923, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
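// The same frequency table is rebuilt for the key path, and the identical slice/neg/cat pattern
// applies RoPE to K ([4, seq_len, 8, 128]).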
%int131072_178 = torch.constant.int 131072 | |
%none_179 = torch.constant.none | |
%none_180 = torch.constant.none | |
%cpu_181 = torch.constant.device "cpu" | |
%false_182 = torch.constant.bool false | |
%924 = torch.aten.arange %int131072_178, %none_179, %none_180, %cpu_181, %false_182 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_183 = torch.constant.int 0 | |
%int128_184 = torch.constant.int 128 | |
%int2_185 = torch.constant.int 2 | |
%int4_186 = torch.constant.int 4 | |
%none_187 = torch.constant.none | |
%cpu_188 = torch.constant.device "cpu" | |
%false_189 = torch.constant.bool false | |
%925 = torch.aten.arange.start_step %int0_183, %int128_184, %int2_185, %int4_186, %none_187, %cpu_188, %false_189 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_190 = torch.constant.int 6 | |
%926 = torch.prims.convert_element_type %925, %int6_190 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_191 = torch.constant.int 128 | |
%927 = torch.aten.div.Scalar %926, %int128_191 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_192 = torch.constant.float 5.000000e+05 | |
%928 = torch.aten.pow.Scalar %float5.000000e05_192, %927 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%929 = torch.aten.reciprocal %928 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_193 = torch.constant.float 1.000000e+00 | |
%930 = torch.aten.mul.Scalar %929, %float1.000000e00_193 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%931 = torch.aten.reciprocal %930 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_194 = torch.constant.float 6.2831853071795862 | |
%932 = torch.aten.mul.Scalar %931, %float6.283190e00_194 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_195 = torch.constant.float 8.192000e+03 | |
%933 = torch.aten.gt.Scalar %932, %float8.192000e03_195 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_196 = torch.constant.int 8 | |
%934 = torch.aten.div.Scalar %930, %int8_196 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%935 = torch.aten.where.self %933, %934, %930 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%936 = torch.aten.reciprocal %932 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_197 = torch.constant.int 8192 | |
%937 = torch.aten.mul.Scalar %936, %int8192_197 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_198 = torch.constant.int 1 | |
%int1_199 = torch.constant.int 1 | |
%938 = torch.aten.sub.Scalar %937, %int1_198, %int1_199 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_200 = torch.constant.int 3 | |
%939 = torch.aten.div.Scalar %938, %int3_200 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_201 = torch.constant.int 1 | |
%int1_202 = torch.constant.int 1 | |
%940 = torch.aten.rsub.Scalar %939, %int1_201, %int1_202 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%941 = torch.aten.mul.Tensor %940, %935 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_203 = torch.constant.int 8 | |
%942 = torch.aten.div.Scalar %941, %int8_203 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%943 = torch.aten.mul.Tensor %939, %935 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_204 = torch.constant.int 1 | |
%944 = torch.aten.add.Tensor %942, %943, %int1_204 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_205 = torch.constant.float 2.048000e+03 | |
%945 = torch.aten.lt.Scalar %932, %float2.048000e03_205 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%946 = torch.aten.bitwise_not %945 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_206 = torch.constant.float 8.192000e+03 | |
%947 = torch.aten.gt.Scalar %932, %float8.192000e03_206 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%948 = torch.aten.bitwise_not %947 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%949 = torch.aten.mul.Tensor %946, %948 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%950 = torch.aten.where.self %949, %944, %935 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%951 = torch.prim.ListConstruct %950, %950 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_207 = torch.constant.int -1 | |
%952 = torch.aten.cat %951, %int-1_207 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_208 = torch.constant.int 6 | |
%953 = torch.prims.convert_element_type %952, %int6_208 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_209 = torch.constant.int 1 | |
%954 = torch.aten.unsqueeze %924, %int1_209 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_210 = torch.constant.int 6 | |
%955 = torch.prims.convert_element_type %954, %int6_210 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_211 = torch.constant.int 0 | |
%956 = torch.aten.unsqueeze %953, %int0_211 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_212 = torch.constant.int 6 | |
%957 = torch.prims.convert_element_type %956, %int6_212 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%958 = torch.aten.mul.Tensor %955, %957 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%959 = torch.aten.cos %958 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_213 = torch.constant.int 15 | |
%960 = torch.prims.convert_element_type %959, %int15_213 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%961 = torch.aten.sin %958 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_214 = torch.constant.int 15 | |
%962 = torch.prims.convert_element_type %961, %int15_214 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_215 = torch.constant.int 0 | |
%int0_216 = torch.constant.int 0 | |
%int1_217 = torch.constant.int 1 | |
%963 = torch.aten.slice.Tensor %960, %int0_215, %int0_216, %777, %int1_217 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %963, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_218 = torch.constant.int 1 | |
%int0_219 = torch.constant.int 0 | |
%int9223372036854775807_220 = torch.constant.int 9223372036854775807 | |
%int1_221 = torch.constant.int 1 | |
%964 = torch.aten.slice.Tensor %963, %int1_218, %int0_219, %int9223372036854775807_220, %int1_221 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %964, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_222 = torch.constant.int 0 | |
%int0_223 = torch.constant.int 0 | |
%int1_224 = torch.constant.int 1 | |
%965 = torch.aten.slice.Tensor %962, %int0_222, %int0_223, %777, %int1_224 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %965, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_225 = torch.constant.int 1 | |
%int0_226 = torch.constant.int 0 | |
%int9223372036854775807_227 = torch.constant.int 9223372036854775807 | |
%int1_228 = torch.constant.int 1 | |
%966 = torch.aten.slice.Tensor %965, %int1_225, %int0_226, %int9223372036854775807_227, %int1_228 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %966, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_229 = torch.constant.int 0 | |
%967 = torch.aten.unsqueeze %964, %int0_229 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %967, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_230 = torch.constant.int 1 | |
%int0_231 = torch.constant.int 0 | |
%int9223372036854775807_232 = torch.constant.int 9223372036854775807 | |
%int1_233 = torch.constant.int 1 | |
%968 = torch.aten.slice.Tensor %967, %int1_230, %int0_231, %int9223372036854775807_232, %int1_233 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %968, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_234 = torch.constant.int 2 | |
%969 = torch.aten.unsqueeze %968, %int2_234 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %969, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_235 = torch.constant.int 3 | |
%int0_236 = torch.constant.int 0 | |
%int9223372036854775807_237 = torch.constant.int 9223372036854775807 | |
%int1_238 = torch.constant.int 1 | |
%970 = torch.aten.slice.Tensor %969, %int3_235, %int0_236, %int9223372036854775807_237, %int1_238 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %970, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_239 = torch.constant.int 4 | |
%int1_240 = torch.constant.int 1 | |
%int1_241 = torch.constant.int 1 | |
%int1_242 = torch.constant.int 1 | |
%971 = torch.prim.ListConstruct %int4_239, %int1_240, %int1_241, %int1_242 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%972 = torch.aten.repeat %970, %971 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %972, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_243 = torch.constant.int 0 | |
%973 = torch.aten.unsqueeze %966, %int0_243 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %973, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_244 = torch.constant.int 1 | |
%int0_245 = torch.constant.int 0 | |
%int9223372036854775807_246 = torch.constant.int 9223372036854775807 | |
%int1_247 = torch.constant.int 1 | |
%974 = torch.aten.slice.Tensor %973, %int1_244, %int0_245, %int9223372036854775807_246, %int1_247 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %974, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_248 = torch.constant.int 2 | |
%975 = torch.aten.unsqueeze %974, %int2_248 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %975, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_249 = torch.constant.int 3 | |
%int0_250 = torch.constant.int 0 | |
%int9223372036854775807_251 = torch.constant.int 9223372036854775807 | |
%int1_252 = torch.constant.int 1 | |
%976 = torch.aten.slice.Tensor %975, %int3_249, %int0_250, %int9223372036854775807_251, %int1_252 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %976, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_253 = torch.constant.int 4 | |
%int1_254 = torch.constant.int 1 | |
%int1_255 = torch.constant.int 1 | |
%int1_256 = torch.constant.int 1 | |
%977 = torch.prim.ListConstruct %int4_253, %int1_254, %int1_255, %int1_256 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%978 = torch.aten.repeat %976, %977 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %978, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
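// The broadcast position tables built above (%972 and %978, presumably the cos and sin tables gathered
// per position) are applied below to the 8-head K projection %858 in the rotate-half RoPE form:
// k * cos + rotate_half(k) * sin.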
%979 = torch.aten.mul.Tensor %858, %972 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %979, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_257 = torch.constant.int 3 | |
%int0_258 = torch.constant.int 0 | |
%int64_259 = torch.constant.int 64 | |
%int1_260 = torch.constant.int 1 | |
%980 = torch.aten.slice.Tensor %858, %int3_257, %int0_258, %int64_259, %int1_260 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %980, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_261 = torch.constant.int 3 | |
%int64_262 = torch.constant.int 64 | |
%int9223372036854775807_263 = torch.constant.int 9223372036854775807 | |
%int1_264 = torch.constant.int 1 | |
%981 = torch.aten.slice.Tensor %858, %int3_261, %int64_262, %int9223372036854775807_263, %int1_264 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %981, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%982 = torch.aten.neg %981 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %982, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%983 = torch.prim.ListConstruct %982, %980 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_265 = torch.constant.int -1 | |
%984 = torch.aten.cat %983, %int-1_265 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %984, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%985 = torch.aten.mul.Tensor %984, %978 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %985, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_266 = torch.constant.int 1 | |
%986 = torch.aten.add.Tensor %979, %985, %int1_266 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %986, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
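// Store the rotated keys into the paged KV cache: the flat cache buffer %771 ([?, 2097152] f8) is viewed
// as [pages, 32, 2, 32, 8, 128] (what looks like [page, block, K/V, tokens-per-page, kv-heads, head-dim])
// and flattened to per-slot pages; scatter indices come from the page table %arg2 as page_id * 64 + 0,
// which appears to select this block's K slots.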
%int32_267 = torch.constant.int 32 | |
%int2_268 = torch.constant.int 2 | |
%int32_269 = torch.constant.int 32 | |
%int8_270 = torch.constant.int 8 | |
%int128_271 = torch.constant.int 128 | |
%987 = torch.prim.ListConstruct %776, %int32_267, %int2_268, %int32_269, %int8_270, %int128_271 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%988 = torch.aten.view %771, %987 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %988, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_272 = torch.constant.int 32 | |
%989 = torch.aten.mul.int %776, %int32_272 : !torch.int, !torch.int -> !torch.int | |
%int2_273 = torch.constant.int 2 | |
%990 = torch.aten.mul.int %989, %int2_273 : !torch.int, !torch.int -> !torch.int | |
%int32_274 = torch.constant.int 32 | |
%int8_275 = torch.constant.int 8 | |
%int128_276 = torch.constant.int 128 | |
%991 = torch.prim.ListConstruct %990, %int32_274, %int8_275, %int128_276 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%992 = torch.aten.view %988, %991 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %992, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int64_277 = torch.constant.int 64 | |
%993 = torch.aten.mul.Scalar %arg2, %int64_277 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %993, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int0_278 = torch.constant.int 0 | |
%int1_279 = torch.constant.int 1 | |
%994 = torch.aten.add.Scalar %993, %int0_278, %int1_279 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %994, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_280 = torch.constant.int 4 | |
%int32_281 = torch.constant.int 32 | |
%int8_282 = torch.constant.int 8 | |
%int128_283 = torch.constant.int 128 | |
%995 = torch.prim.ListConstruct %int4_280, %775, %int32_281, %int8_282, %int128_283 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%996 = torch.aten.view %986, %995 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %996, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int4_284 = torch.constant.int 4 | |
%997 = torch.aten.mul.int %int4_284, %775 : !torch.int, !torch.int -> !torch.int | |
%int32_285 = torch.constant.int 32 | |
%int8_286 = torch.constant.int 8 | |
%int128_287 = torch.constant.int 128 | |
%998 = torch.prim.ListConstruct %997, %int32_285, %int8_286, %int128_287 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%999 = torch.aten.view %996, %998 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %999, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1000 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1001 = torch.aten.view %994, %1000 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1001, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
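// The scatter itself is done on a byte view: both the cache pages and the new keys are bit-cast from
// f8E4M3FNUZ (dtype 26) to si8 (dtype 1), written with index_put, then bit-cast back and reshaped into
// the flat [?, 2097152] cache layout.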
%int26_288 = torch.constant.int 26 | |
%1002 = torch.prims.convert_element_type %999, %int26_288 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1002, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_289 = torch.constant.int 1 | |
%1003 = torch.aten.view.dtype %992, %int1_289 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1003, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1004 = torch.aten.detach %1003 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1004, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1005 = torch.aten.detach %1004 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1005, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int1_290 = torch.constant.int 1 | |
%1006 = torch.aten.view.dtype %1002, %int1_290 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1006, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1007 = torch.aten.detach %1006 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1007, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1008 = torch.aten.detach %1007 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1008, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1009 = torch.prim.ListConstruct %1001 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_291 = torch.constant.bool false | |
%1010 = torch.aten.index_put %1005, %1009, %1008, %false_291 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1010, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_292 = torch.constant.int 26 | |
%1011 = torch.aten.view.dtype %1010, %int26_292 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1011, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1012 = torch.aten.detach %1011 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1012, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1013 = torch.aten.detach %1012 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1013, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_293 = torch.constant.int 32 | |
%int2_294 = torch.constant.int 2 | |
%int32_295 = torch.constant.int 32 | |
%int8_296 = torch.constant.int 8 | |
%int128_297 = torch.constant.int 128 | |
%1014 = torch.prim.ListConstruct %776, %int32_293, %int2_294, %int32_295, %int8_296, %int128_297 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1015 = torch.aten.view %1013, %1014 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1015, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152 = torch.constant.int 2097152 | |
%1016 = torch.prim.ListConstruct %776, %int2097152 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1017 = torch.aten.view %1015, %1016 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1017, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
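// The value projection %860 is written into the cache the same way, using slot offset +1 within each
// page (%1022 = %994 + 1).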
%int4_298 = torch.constant.int 4 | |
%int32_299 = torch.constant.int 32 | |
%int8_300 = torch.constant.int 8 | |
%int128_301 = torch.constant.int 128 | |
%1018 = torch.prim.ListConstruct %int4_298, %775, %int32_299, %int8_300, %int128_301 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1019 = torch.aten.view %860, %1018 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1019, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_302 = torch.constant.int 32 | |
%int8_303 = torch.constant.int 8 | |
%int128_304 = torch.constant.int 128 | |
%1020 = torch.prim.ListConstruct %997, %int32_302, %int8_303, %int128_304 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1021 = torch.aten.view %1019, %1020 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1021, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_305 = torch.constant.int 1 | |
%int1_306 = torch.constant.int 1 | |
%1022 = torch.aten.add.Scalar %994, %int1_305, %int1_306 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1022, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%1023 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1024 = torch.aten.view %1022, %1023 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1024, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_307 = torch.constant.int 26 | |
%1025 = torch.prims.convert_element_type %1021, %int26_307 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1025, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_308 = torch.constant.int 1 | |
%1026 = torch.aten.view.dtype %1025, %int1_308 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1026, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1027 = torch.aten.detach %1026 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1027, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1028 = torch.aten.detach %1027 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1028, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_309 = torch.constant.int 32 | |
%int2_310 = torch.constant.int 2 | |
%int32_311 = torch.constant.int 32 | |
%int8_312 = torch.constant.int 8 | |
%int128_313 = torch.constant.int 128 | |
%1029 = torch.prim.ListConstruct %776, %int32_309, %int2_310, %int32_311, %int8_312, %int128_313 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1030 = torch.aten.view %1017, %1029 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1030, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_314 = torch.constant.int 32 | |
%int8_315 = torch.constant.int 8 | |
%int128_316 = torch.constant.int 128 | |
%1031 = torch.prim.ListConstruct %990, %int32_314, %int8_315, %int128_316 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1032 = torch.aten.view %1030, %1031 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1032, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_317 = torch.constant.int 1 | |
%1033 = torch.aten.view.dtype %1032, %int1_317 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1033, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1034 = torch.aten.detach %1033 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1034, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1035 = torch.aten.detach %1034 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1035, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1036 = torch.prim.ListConstruct %1024 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_318 = torch.constant.bool false | |
%1037 = torch.aten.index_put %1035, %1036, %1028, %false_318 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1037, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_319 = torch.constant.int 26 | |
%1038 = torch.aten.view.dtype %1037, %int26_319 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1038, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1039 = torch.aten.detach %1038 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1039, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1040 = torch.aten.detach %1039 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1040, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_320 = torch.constant.int 32 | |
%int2_321 = torch.constant.int 2 | |
%int32_322 = torch.constant.int 32 | |
%int8_323 = torch.constant.int 8 | |
%int128_324 = torch.constant.int 128 | |
%1041 = torch.prim.ListConstruct %776, %int32_320, %int2_321, %int32_322, %int8_323, %int128_324 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1042 = torch.aten.view %1040, %1041 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1042, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_325 = torch.constant.int 2097152 | |
%1043 = torch.prim.ListConstruct %776, %int2097152_325 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1044 = torch.aten.view %1042, %1043 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1044, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
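// Grouped-query attention head expansion: each of the 8 K/V heads is repeated 4 times
// (unsqueeze + expand + clone + view) to match the 32 query heads, giving [4, seq, 32, 128] K and V.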
%int-2 = torch.constant.int -2 | |
%1045 = torch.aten.unsqueeze %986, %int-2 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1045, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_326 = torch.constant.int 4 | |
%int8_327 = torch.constant.int 8 | |
%int4_328 = torch.constant.int 4 | |
%int128_329 = torch.constant.int 128 | |
%1046 = torch.prim.ListConstruct %int4_326, %777, %int8_327, %int4_328, %int128_329 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_330 = torch.constant.bool false | |
%1047 = torch.aten.expand %1045, %1046, %false_330 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1047, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_331 = torch.constant.int 0 | |
%1048 = torch.aten.clone %1047, %int0_331 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1048, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_332 = torch.constant.int 4 | |
%int32_333 = torch.constant.int 32 | |
%int128_334 = torch.constant.int 128 | |
%1049 = torch.prim.ListConstruct %int4_332, %777, %int32_333, %int128_334 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1050 = torch.aten._unsafe_view %1048, %1049 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1050, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_335 = torch.constant.int -2 | |
%1051 = torch.aten.unsqueeze %860, %int-2_335 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1051, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_336 = torch.constant.int 4 | |
%int8_337 = torch.constant.int 8 | |
%int4_338 = torch.constant.int 4 | |
%int128_339 = torch.constant.int 128 | |
%1052 = torch.prim.ListConstruct %int4_336, %777, %int8_337, %int4_338, %int128_339 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_340 = torch.constant.bool false | |
%1053 = torch.aten.expand %1051, %1052, %false_340 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1053, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_341 = torch.constant.int 0 | |
%1054 = torch.aten.clone %1053, %int0_341 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1054, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_342 = torch.constant.int 4 | |
%int32_343 = torch.constant.int 32 | |
%int128_344 = torch.constant.int 128 | |
%1055 = torch.prim.ListConstruct %int4_342, %777, %int32_343, %int128_344 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1056 = torch.aten._unsafe_view %1054, %1055 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1056, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
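// Q (%923), K (%1050) and V (%1056) are transposed to [batch, heads, seq, head_dim] and converted to
// f8E4M3FNUZ for the fused attention kernel.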
%int1_345 = torch.constant.int 1 | |
%int2_346 = torch.constant.int 2 | |
%1057 = torch.aten.transpose.int %923, %int1_345, %int2_346 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1057, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_347 = torch.constant.int 1 | |
%int2_348 = torch.constant.int 2 | |
%1058 = torch.aten.transpose.int %1050, %int1_347, %int2_348 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1058, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_349 = torch.constant.int 1 | |
%int2_350 = torch.constant.int 2 | |
%1059 = torch.aten.transpose.int %1056, %int1_349, %int2_350 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1059, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_351 = torch.constant.int 26 | |
%1060 = torch.prims.convert_element_type %1057, %int26_351 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1060, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_352 = torch.constant.int 26 | |
%1061 = torch.prims.convert_element_type %1058, %int26_352 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1061, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_353 = torch.constant.int 26 | |
%1062 = torch.prims.convert_element_type %1059, %int26_353 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1062, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
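// The additive attention mask %803 ([4, 1, seq, seq] f32) is cast to f8 and reduced to a single
// [seq, seq] slice taken from batch 0, matching the 2-D mask operand of the kernel below.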
%int26_354 = torch.constant.int 26 | |
%1063 = torch.prims.convert_element_type %803, %int26_354 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1063, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_355 = torch.constant.int 0 | |
%int0_356 = torch.constant.int 0 | |
%1064 = torch.aten.select.int %1063, %int0_355, %int0_356 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1064, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_357 = torch.constant.int 0 | |
%int0_358 = torch.constant.int 0 | |
%1065 = torch.aten.select.int %1064, %int0_357, %int0_358 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1065, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_359 = torch.constant.int 0 | |
%int0_360 = torch.constant.int 0 | |
%int9223372036854775807_361 = torch.constant.int 9223372036854775807 | |
%int1_362 = torch.constant.int 1 | |
%1066 = torch.aten.slice.Tensor %1065, %int0_359, %int0_360, %int9223372036854775807_361, %int1_362 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1066, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_363 = torch.constant.int 1 | |
%int0_364 = torch.constant.int 0 | |
%int9223372036854775807_365 = torch.constant.int 9223372036854775807 | |
%int1_366 = torch.constant.int 1 | |
%1067 = torch.aten.slice.Tensor %1066, %int1_363, %int0_364, %int9223372036854775807_365, %int1_366 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1067, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
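// The scalar scale constant %11 is cloned/detached and passed, together with q/k/v and the mask, to the
// sharktank masked flash-attention microkernel, which returns the attention output in f32.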
%none_367 = torch.constant.none | |
%1068 = torch.aten.clone %11, %none_367 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%1069 = torch.aten.detach %1068 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1070 = torch.aten.detach %1069 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1071 = torch.aten.detach %1070 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1072 = torch_c.to_builtin_tensor %1060 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1073 = torch_c.to_builtin_tensor %1061 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1074 = torch_c.to_builtin_tensor %1062 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1075 = torch_c.to_builtin_tensor %1067 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%1076 = torch_c.to_builtin_tensor %1071 : !torch.vtensor<[],f32> -> tensor<f32> | |
%1077 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%1072, %1073, %1074, %1076, %1075) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%1078 = torch_c.from_builtin_tensor %1077 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %1078, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
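// Attention output: transpose back to [4, seq, 32, 128], flatten to [4, seq, 4096], then re-quantize to
// f8 for the output projection (divide by the input scale %12 and clamp to the f8E4M3FNUZ finite
// range of +/-240).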
%int1_368 = torch.constant.int 1 | |
%int2_369 = torch.constant.int 2 | |
%1079 = torch.aten.transpose.int %1078, %int1_368, %int2_369 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1079, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_370 = torch.constant.int 0 | |
%1080 = torch.aten.clone %1079, %int0_370 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1080, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_371 = torch.constant.int 4 | |
%int4096_372 = torch.constant.int 4096 | |
%1081 = torch.prim.ListConstruct %int4_371, %777, %int4096_372 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1082 = torch.aten._unsafe_view %1080, %1081 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1082, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1083 = torch.aten.div.Tensor %1082, %12 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1083, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_373 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_374 = torch.constant.float 2.400000e+02 | |
%1084 = torch.aten.clamp %1083, %float-2.400000e02_373, %float2.400000e02_374 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1084, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_375 = torch.constant.int 26 | |
%1085 = torch.prims.convert_element_type %1084, %int26_375 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1085, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
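// Output projection: the 4096x4096 weight %13 (presumably attn_output) is broadcast over the batch and
// applied with the batch_matmul_transpose_b kernel; the f32 result is rescaled by the combined dequant
// scale (%14 * %12) and added to the residual stream %805.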
%int0_376 = torch.constant.int 0 | |
%1086 = torch.aten.unsqueeze %13, %int0_376 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_377 = torch.constant.int 4 | |
%int4096_378 = torch.constant.int 4096 | |
%int4096_379 = torch.constant.int 4096 | |
%1087 = torch.prim.ListConstruct %int4_377, %int4096_378, %int4096_379 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_380 = torch.constant.bool false | |
%1088 = torch.aten.expand %1086, %1087, %false_380 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1089 = torch_c.to_builtin_tensor %1085 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1090 = torch_c.to_builtin_tensor %1088 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1091 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1089, %1090) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1092 = torch_c.from_builtin_tensor %1091 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1092, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1093 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1094 = torch.aten.permute %14, %1093 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1095 = torch.aten.mul.Tensor %12, %1094 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_381 = torch.constant.int 6 | |
%1096 = torch.prims.convert_element_type %1092, %int6_381 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1096, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1097 = torch.aten.mul.Tensor %1096, %1095 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1097, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_382 = torch.constant.int 1 | |
%1098 = torch.aten.add.Tensor %805, %1097, %int1_382 : !torch.vtensor<[4,?,4096],bf16>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1098, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
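// RMSNorm over the updated residual: mean of squares along the hidden dim, add eps 1e-5, rsqrt, then
// scale by %15 (presumably the ffn_norm weight).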
%int6_383 = torch.constant.int 6 | |
%1099 = torch.prims.convert_element_type %1098, %int6_383 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1099, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_384 = torch.constant.int 2 | |
%1100 = torch.aten.pow.Tensor_Scalar %1099, %int2_384 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1100, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_385 = torch.constant.int -1 | |
%1101 = torch.prim.ListConstruct %int-1_385 : (!torch.int) -> !torch.list<int> | |
%true_386 = torch.constant.bool true | |
%none_387 = torch.constant.none | |
%1102 = torch.aten.mean.dim %1100, %1101, %true_386, %none_387 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1102, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_388 = torch.constant.float 1.000000e-05 | |
%int1_389 = torch.constant.int 1 | |
%1103 = torch.aten.add.Scalar %1102, %float1.000000e-05_388, %int1_389 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1103, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1104 = torch.aten.rsqrt %1103 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1104, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1105 = torch.aten.mul.Tensor %1099, %1104 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1105, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_390 = torch.constant.int 6 | |
%1106 = torch.prims.convert_element_type %1105, %int6_390 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1106, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1107 = torch.aten.mul.Tensor %15, %1106 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1107, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_391 = torch.constant.int 6 | |
%1108 = torch.prims.convert_element_type %1107, %int6_391 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1108, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
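// Gate branch of the FFN: quantize to f8 (divide by %16, clamp to +/-240), batch matmul against the
// 14336x4096 weight %17, rescale by %18 * %16, then apply SiLU.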
%1109 = torch.aten.div.Tensor %1108, %16 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1109, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_392 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_393 = torch.constant.float 2.400000e+02 | |
%1110 = torch.aten.clamp %1109, %float-2.400000e02_392, %float2.400000e02_393 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1110, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_394 = torch.constant.int 26 | |
%1111 = torch.prims.convert_element_type %1110, %int26_394 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1111, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_395 = torch.constant.int 0 | |
%1112 = torch.aten.unsqueeze %17, %int0_395 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_396 = torch.constant.int 4 | |
%int14336 = torch.constant.int 14336 | |
%int4096_397 = torch.constant.int 4096 | |
%1113 = torch.prim.ListConstruct %int4_396, %int14336, %int4096_397 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_398 = torch.constant.bool false | |
%1114 = torch.aten.expand %1112, %1113, %false_398 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1115 = torch_c.to_builtin_tensor %1111 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1116 = torch_c.to_builtin_tensor %1114 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1117 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1115, %1116) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1118 = torch_c.from_builtin_tensor %1117 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1118, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1119 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1120 = torch.aten.permute %18, %1119 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1121 = torch.aten.mul.Tensor %16, %1120 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_399 = torch.constant.int 6 | |
%1122 = torch.prims.convert_element_type %1118, %int6_399 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1122, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1123 = torch.aten.mul.Tensor %1122, %1121 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1123, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1124 = torch.aten.silu %1123 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1124, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
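// Up branch of the FFN: the same quantize / matmul / rescale sequence with the 14336x4096 weight %20
// and scales %19 and %21.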
%1125 = torch.aten.div.Tensor %1108, %19 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1125, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_400 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_401 = torch.constant.float 2.400000e+02 | |
%1126 = torch.aten.clamp %1125, %float-2.400000e02_400, %float2.400000e02_401 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1126, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_402 = torch.constant.int 26 | |
%1127 = torch.prims.convert_element_type %1126, %int26_402 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1127, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_403 = torch.constant.int 0 | |
%1128 = torch.aten.unsqueeze %20, %int0_403 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_404 = torch.constant.int 4 | |
%int14336_405 = torch.constant.int 14336 | |
%int4096_406 = torch.constant.int 4096 | |
%1129 = torch.prim.ListConstruct %int4_404, %int14336_405, %int4096_406 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_407 = torch.constant.bool false | |
%1130 = torch.aten.expand %1128, %1129, %false_407 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1131 = torch_c.to_builtin_tensor %1127 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1132 = torch_c.to_builtin_tensor %1130 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1133 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1131, %1132) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1134 = torch_c.from_builtin_tensor %1133 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1134, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1135 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1136 = torch.aten.permute %21, %1135 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1137 = torch.aten.mul.Tensor %19, %1136 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_408 = torch.constant.int 6 | |
%1138 = torch.prims.convert_element_type %1134, %int6_408 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1138, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1139 = torch.aten.mul.Tensor %1138, %1137 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1139, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1140 = torch.aten.mul.Tensor %1124, %1139 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1140, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
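// SwiGLU combine and down-projection: silu(gate) * up is re-quantized (scale %22), multiplied by the
// 4096x14336 weight %23, rescaled by %24 * %22, and added to the residual %1098.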
%1141 = torch.aten.div.Tensor %1140, %22 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1141, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_409 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_410 = torch.constant.float 2.400000e+02 | |
%1142 = torch.aten.clamp %1141, %float-2.400000e02_409, %float2.400000e02_410 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1142, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_411 = torch.constant.int 26 | |
%1143 = torch.prims.convert_element_type %1142, %int26_411 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1143, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_412 = torch.constant.int 0 | |
%1144 = torch.aten.unsqueeze %23, %int0_412 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_413 = torch.constant.int 4 | |
%int4096_414 = torch.constant.int 4096 | |
%int14336_415 = torch.constant.int 14336 | |
%1145 = torch.prim.ListConstruct %int4_413, %int4096_414, %int14336_415 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_416 = torch.constant.bool false | |
%1146 = torch.aten.expand %1144, %1145, %false_416 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%1147 = torch_c.to_builtin_tensor %1143 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%1148 = torch_c.to_builtin_tensor %1146 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%1149 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%1147, %1148) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1150 = torch_c.from_builtin_tensor %1149 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1150, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1151 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1152 = torch.aten.permute %24, %1151 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1153 = torch.aten.mul.Tensor %22, %1152 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_417 = torch.constant.int 6 | |
%1154 = torch.prims.convert_element_type %1150, %int6_417 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1154, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1155 = torch.aten.mul.Tensor %1154, %1153 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1155, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_418 = torch.constant.int 1 | |
%1156 = torch.aten.add.Tensor %1098, %1155, %int1_418 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1156, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
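// What appears to be the next block's attention RMSNorm over the new residual, scaled by the
// [4096] bf16 weight %25.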
%int6_419 = torch.constant.int 6 | |
%1157 = torch.prims.convert_element_type %1156, %int6_419 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1157, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_420 = torch.constant.int 2 | |
%1158 = torch.aten.pow.Tensor_Scalar %1157, %int2_420 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1158, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_421 = torch.constant.int -1 | |
%1159 = torch.prim.ListConstruct %int-1_421 : (!torch.int) -> !torch.list<int> | |
%true_422 = torch.constant.bool true | |
%none_423 = torch.constant.none | |
%1160 = torch.aten.mean.dim %1158, %1159, %true_422, %none_423 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1160, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_424 = torch.constant.float 1.000000e-05 | |
%int1_425 = torch.constant.int 1 | |
%1161 = torch.aten.add.Scalar %1160, %float1.000000e-05_424, %int1_425 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1161, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1162 = torch.aten.rsqrt %1161 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1162, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1163 = torch.aten.mul.Tensor %1157, %1162 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1163, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_426 = torch.constant.int 6 | |
%1164 = torch.prims.convert_element_type %1163, %int6_426 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1164, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1165 = torch.aten.mul.Tensor %25, %1164 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1165, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_427 = torch.constant.int 6 | |
%1166 = torch.prims.convert_element_type %1165, %int6_427 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1166, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
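// Q, K and V projections for this block: each path divides by its input scale, clamps to +/-240,
// converts to f8, runs a batch matmul against the 4096x4096 (Q) or 1024x4096 (K, V) weight, and then
// re-quantizes the f32 result back to f8 with the corresponding output scale.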
%1167 = torch.aten.div.Tensor %1166, %26 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1167, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_428 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_429 = torch.constant.float 2.400000e+02 | |
%1168 = torch.aten.clamp %1167, %float-2.400000e02_428, %float2.400000e02_429 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1168, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_430 = torch.constant.int 26 | |
%1169 = torch.prims.convert_element_type %1168, %int26_430 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1169, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_431 = torch.constant.int 0 | |
%1170 = torch.aten.unsqueeze %27, %int0_431 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_432 = torch.constant.int 4 | |
%int4096_433 = torch.constant.int 4096 | |
%int4096_434 = torch.constant.int 4096 | |
%1171 = torch.prim.ListConstruct %int4_432, %int4096_433, %int4096_434 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_435 = torch.constant.bool false | |
%1172 = torch.aten.expand %1170, %1171, %false_435 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1173 = torch_c.to_builtin_tensor %1169 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1174 = torch_c.to_builtin_tensor %1172 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1175 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1173, %1174) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1176 = torch_c.from_builtin_tensor %1175 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1176, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1177 = torch.aten.div.Tensor %1176, %28 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1177, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_436 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_437 = torch.constant.float 2.400000e+02 | |
%1178 = torch.aten.clamp %1177, %float-2.400000e02_436, %float2.400000e02_437 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1178, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_438 = torch.constant.int 26 | |
%1179 = torch.prims.convert_element_type %1178, %int26_438 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1179, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%1180 = torch.aten.div.Tensor %1166, %29 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1180, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_439 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_440 = torch.constant.float 2.400000e+02 | |
%1181 = torch.aten.clamp %1180, %float-2.400000e02_439, %float2.400000e02_440 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1181, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_441 = torch.constant.int 26 | |
%1182 = torch.prims.convert_element_type %1181, %int26_441 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1182, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_442 = torch.constant.int 0 | |
%1183 = torch.aten.unsqueeze %30, %int0_442 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_443 = torch.constant.int 4 | |
%int1024_444 = torch.constant.int 1024 | |
%int4096_445 = torch.constant.int 4096 | |
%1184 = torch.prim.ListConstruct %int4_443, %int1024_444, %int4096_445 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_446 = torch.constant.bool false | |
%1185 = torch.aten.expand %1183, %1184, %false_446 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1186 = torch_c.to_builtin_tensor %1182 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1187 = torch_c.to_builtin_tensor %1185 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1188 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1186, %1187) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1189 = torch_c.from_builtin_tensor %1188 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1189, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1190 = torch.aten.div.Tensor %1189, %31 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1190, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_447 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_448 = torch.constant.float 2.400000e+02 | |
%1191 = torch.aten.clamp %1190, %float-2.400000e02_447, %float2.400000e02_448 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1191, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_449 = torch.constant.int 26 | |
%1192 = torch.prims.convert_element_type %1191, %int26_449 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1192, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%1193 = torch.aten.div.Tensor %1166, %32 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1193, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_450 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_451 = torch.constant.float 2.400000e+02 | |
%1194 = torch.aten.clamp %1193, %float-2.400000e02_450, %float2.400000e02_451 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1194, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_452 = torch.constant.int 26 | |
%1195 = torch.prims.convert_element_type %1194, %int26_452 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1195, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_453 = torch.constant.int 0 | |
%1196 = torch.aten.unsqueeze %33, %int0_453 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_454 = torch.constant.int 4 | |
%int1024_455 = torch.constant.int 1024 | |
%int4096_456 = torch.constant.int 4096 | |
%1197 = torch.prim.ListConstruct %int4_454, %int1024_455, %int4096_456 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_457 = torch.constant.bool false | |
%1198 = torch.aten.expand %1196, %1197, %false_457 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1199 = torch_c.to_builtin_tensor %1195 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1200 = torch_c.to_builtin_tensor %1198 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1201 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1199, %1200) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1202 = torch_c.from_builtin_tensor %1201 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1202, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1203 = torch.aten.div.Tensor %1202, %34 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1203, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_458 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_459 = torch.constant.float 2.400000e+02 | |
%1204 = torch.aten.clamp %1203, %float-2.400000e02_458, %float2.400000e02_459 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1204, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_460 = torch.constant.int 26 | |
%1205 = torch.prims.convert_element_type %1204, %int26_460 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1205, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
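// Split the projections into heads: the 4096-wide projection becomes [4,?,32,128] (32 query heads) and the two 1024-wide projections become [4,?,8,128] (8 KV heads, head dim 128).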
%int4_461 = torch.constant.int 4 | |
%int32_462 = torch.constant.int 32 | |
%int128_463 = torch.constant.int 128 | |
%1206 = torch.prim.ListConstruct %int4_461, %777, %int32_462, %int128_463 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1207 = torch.aten.view %1179, %1206 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1207, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_464 = torch.constant.int 4 | |
%int8_465 = torch.constant.int 8 | |
%int128_466 = torch.constant.int 128 | |
%1208 = torch.prim.ListConstruct %int4_464, %777, %int8_465, %int128_466 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1209 = torch.aten.view %1192, %1208 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1209, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_467 = torch.constant.int 4 | |
%int8_468 = torch.constant.int 8 | |
%int128_469 = torch.constant.int 128 | |
%1210 = torch.prim.ListConstruct %int4_467, %777, %int8_468, %int128_469 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1211 = torch.aten.view %1205, %1210 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1211, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
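// Build rotary-embedding tables for up to 131072 positions: inverse frequencies with base 5.0e5, followed by what looks like Llama-3.1 style frequency scaling (factor 8, smoothed between wavelengths 2048 and 8192), then cos/sin tables cast to bf16.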
%int131072_470 = torch.constant.int 131072 | |
%none_471 = torch.constant.none | |
%none_472 = torch.constant.none | |
%cpu_473 = torch.constant.device "cpu" | |
%false_474 = torch.constant.bool false | |
%1212 = torch.aten.arange %int131072_470, %none_471, %none_472, %cpu_473, %false_474 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_475 = torch.constant.int 0 | |
%int128_476 = torch.constant.int 128 | |
%int2_477 = torch.constant.int 2 | |
%int4_478 = torch.constant.int 4 | |
%none_479 = torch.constant.none | |
%cpu_480 = torch.constant.device "cpu" | |
%false_481 = torch.constant.bool false | |
%1213 = torch.aten.arange.start_step %int0_475, %int128_476, %int2_477, %int4_478, %none_479, %cpu_480, %false_481 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_482 = torch.constant.int 6 | |
%1214 = torch.prims.convert_element_type %1213, %int6_482 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_483 = torch.constant.int 128 | |
%1215 = torch.aten.div.Scalar %1214, %int128_483 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_484 = torch.constant.float 5.000000e+05 | |
%1216 = torch.aten.pow.Scalar %float5.000000e05_484, %1215 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1217 = torch.aten.reciprocal %1216 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_485 = torch.constant.float 1.000000e+00 | |
%1218 = torch.aten.mul.Scalar %1217, %float1.000000e00_485 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1219 = torch.aten.reciprocal %1218 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_486 = torch.constant.float 6.2831853071795862 | |
%1220 = torch.aten.mul.Scalar %1219, %float6.283190e00_486 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_487 = torch.constant.float 8.192000e+03 | |
%1221 = torch.aten.gt.Scalar %1220, %float8.192000e03_487 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_488 = torch.constant.int 8 | |
%1222 = torch.aten.div.Scalar %1218, %int8_488 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1223 = torch.aten.where.self %1221, %1222, %1218 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1224 = torch.aten.reciprocal %1220 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_489 = torch.constant.int 8192 | |
%1225 = torch.aten.mul.Scalar %1224, %int8192_489 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_490 = torch.constant.int 1 | |
%int1_491 = torch.constant.int 1 | |
%1226 = torch.aten.sub.Scalar %1225, %int1_490, %int1_491 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_492 = torch.constant.int 3 | |
%1227 = torch.aten.div.Scalar %1226, %int3_492 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_493 = torch.constant.int 1 | |
%int1_494 = torch.constant.int 1 | |
%1228 = torch.aten.rsub.Scalar %1227, %int1_493, %int1_494 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1229 = torch.aten.mul.Tensor %1228, %1223 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_495 = torch.constant.int 8 | |
%1230 = torch.aten.div.Scalar %1229, %int8_495 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1231 = torch.aten.mul.Tensor %1227, %1223 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_496 = torch.constant.int 1 | |
%1232 = torch.aten.add.Tensor %1230, %1231, %int1_496 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_497 = torch.constant.float 2.048000e+03 | |
%1233 = torch.aten.lt.Scalar %1220, %float2.048000e03_497 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1234 = torch.aten.bitwise_not %1233 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_498 = torch.constant.float 8.192000e+03 | |
%1235 = torch.aten.gt.Scalar %1220, %float8.192000e03_498 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1236 = torch.aten.bitwise_not %1235 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1237 = torch.aten.mul.Tensor %1234, %1236 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1238 = torch.aten.where.self %1237, %1232, %1223 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1239 = torch.prim.ListConstruct %1238, %1238 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_499 = torch.constant.int -1 | |
%1240 = torch.aten.cat %1239, %int-1_499 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_500 = torch.constant.int 6 | |
%1241 = torch.prims.convert_element_type %1240, %int6_500 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_501 = torch.constant.int 1 | |
%1242 = torch.aten.unsqueeze %1212, %int1_501 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_502 = torch.constant.int 6 | |
%1243 = torch.prims.convert_element_type %1242, %int6_502 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_503 = torch.constant.int 0 | |
%1244 = torch.aten.unsqueeze %1241, %int0_503 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_504 = torch.constant.int 6 | |
%1245 = torch.prims.convert_element_type %1244, %int6_504 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1246 = torch.aten.mul.Tensor %1243, %1245 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1247 = torch.aten.cos %1246 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_505 = torch.constant.int 15 | |
%1248 = torch.prims.convert_element_type %1247, %int15_505 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1249 = torch.aten.sin %1246 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_506 = torch.constant.int 15 | |
%1250 = torch.prims.convert_element_type %1249, %int15_506 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
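// Slice the cos/sin tables down to the current sequence length and broadcast them to [4, seq, 1, 128] so one table row multiplies every query head.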
%int0_507 = torch.constant.int 0 | |
%int0_508 = torch.constant.int 0 | |
%int1_509 = torch.constant.int 1 | |
%1251 = torch.aten.slice.Tensor %1248, %int0_507, %int0_508, %777, %int1_509 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1251, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_510 = torch.constant.int 1 | |
%int0_511 = torch.constant.int 0 | |
%int9223372036854775807_512 = torch.constant.int 9223372036854775807 | |
%int1_513 = torch.constant.int 1 | |
%1252 = torch.aten.slice.Tensor %1251, %int1_510, %int0_511, %int9223372036854775807_512, %int1_513 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1252, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_514 = torch.constant.int 0 | |
%int0_515 = torch.constant.int 0 | |
%int1_516 = torch.constant.int 1 | |
%1253 = torch.aten.slice.Tensor %1250, %int0_514, %int0_515, %777, %int1_516 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1253, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_517 = torch.constant.int 1 | |
%int0_518 = torch.constant.int 0 | |
%int9223372036854775807_519 = torch.constant.int 9223372036854775807 | |
%int1_520 = torch.constant.int 1 | |
%1254 = torch.aten.slice.Tensor %1253, %int1_517, %int0_518, %int9223372036854775807_519, %int1_520 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1254, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_521 = torch.constant.int 0 | |
%1255 = torch.aten.unsqueeze %1252, %int0_521 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1255, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_522 = torch.constant.int 1 | |
%int0_523 = torch.constant.int 0 | |
%int9223372036854775807_524 = torch.constant.int 9223372036854775807 | |
%int1_525 = torch.constant.int 1 | |
%1256 = torch.aten.slice.Tensor %1255, %int1_522, %int0_523, %int9223372036854775807_524, %int1_525 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1256, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_526 = torch.constant.int 2 | |
%1257 = torch.aten.unsqueeze %1256, %int2_526 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1257, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_527 = torch.constant.int 3 | |
%int0_528 = torch.constant.int 0 | |
%int9223372036854775807_529 = torch.constant.int 9223372036854775807 | |
%int1_530 = torch.constant.int 1 | |
%1258 = torch.aten.slice.Tensor %1257, %int3_527, %int0_528, %int9223372036854775807_529, %int1_530 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1258, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_531 = torch.constant.int 4 | |
%int1_532 = torch.constant.int 1 | |
%int1_533 = torch.constant.int 1 | |
%int1_534 = torch.constant.int 1 | |
%1259 = torch.prim.ListConstruct %int4_531, %int1_532, %int1_533, %int1_534 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1260 = torch.aten.repeat %1258, %1259 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1260, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_535 = torch.constant.int 0 | |
%1261 = torch.aten.unsqueeze %1254, %int0_535 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1261, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_536 = torch.constant.int 1 | |
%int0_537 = torch.constant.int 0 | |
%int9223372036854775807_538 = torch.constant.int 9223372036854775807 | |
%int1_539 = torch.constant.int 1 | |
%1262 = torch.aten.slice.Tensor %1261, %int1_536, %int0_537, %int9223372036854775807_538, %int1_539 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1262, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_540 = torch.constant.int 2 | |
%1263 = torch.aten.unsqueeze %1262, %int2_540 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1263, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_541 = torch.constant.int 3 | |
%int0_542 = torch.constant.int 0 | |
%int9223372036854775807_543 = torch.constant.int 9223372036854775807 | |
%int1_544 = torch.constant.int 1 | |
%1264 = torch.aten.slice.Tensor %1263, %int3_541, %int0_542, %int9223372036854775807_543, %int1_544 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1264, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_545 = torch.constant.int 4 | |
%int1_546 = torch.constant.int 1 | |
%int1_547 = torch.constant.int 1 | |
%int1_548 = torch.constant.int 1 | |
%1265 = torch.prim.ListConstruct %int4_545, %int1_546, %int1_547, %int1_548 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1266 = torch.aten.repeat %1264, %1265 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1266, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
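// Apply the rotary embedding to Q: q*cos + rotate_half(q)*sin, where rotate_half negates the upper 64 lanes and concatenates them ahead of the lower 64.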
%1267 = torch.aten.mul.Tensor %1207, %1260 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1267, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_549 = torch.constant.int 3 | |
%int0_550 = torch.constant.int 0 | |
%int64_551 = torch.constant.int 64 | |
%int1_552 = torch.constant.int 1 | |
%1268 = torch.aten.slice.Tensor %1207, %int3_549, %int0_550, %int64_551, %int1_552 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1268, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_553 = torch.constant.int 3 | |
%int64_554 = torch.constant.int 64 | |
%int9223372036854775807_555 = torch.constant.int 9223372036854775807 | |
%int1_556 = torch.constant.int 1 | |
%1269 = torch.aten.slice.Tensor %1207, %int3_553, %int64_554, %int9223372036854775807_555, %int1_556 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1269, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1270 = torch.aten.neg %1269 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1270, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1271 = torch.prim.ListConstruct %1270, %1268 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_557 = torch.constant.int -1 | |
%1272 = torch.aten.cat %1271, %int-1_557 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1272, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%1273 = torch.aten.mul.Tensor %1272, %1266 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1273, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_558 = torch.constant.int 1 | |
%1274 = torch.aten.add.Tensor %1267, %1273, %int1_558 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1274, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
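// The same frequency, cos/sin and broadcast computation is repeated for the key path.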
%int131072_559 = torch.constant.int 131072 | |
%none_560 = torch.constant.none | |
%none_561 = torch.constant.none | |
%cpu_562 = torch.constant.device "cpu" | |
%false_563 = torch.constant.bool false | |
%1275 = torch.aten.arange %int131072_559, %none_560, %none_561, %cpu_562, %false_563 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_564 = torch.constant.int 0 | |
%int128_565 = torch.constant.int 128 | |
%int2_566 = torch.constant.int 2 | |
%int4_567 = torch.constant.int 4 | |
%none_568 = torch.constant.none | |
%cpu_569 = torch.constant.device "cpu" | |
%false_570 = torch.constant.bool false | |
%1276 = torch.aten.arange.start_step %int0_564, %int128_565, %int2_566, %int4_567, %none_568, %cpu_569, %false_570 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_571 = torch.constant.int 6 | |
%1277 = torch.prims.convert_element_type %1276, %int6_571 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_572 = torch.constant.int 128 | |
%1278 = torch.aten.div.Scalar %1277, %int128_572 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_573 = torch.constant.float 5.000000e+05 | |
%1279 = torch.aten.pow.Scalar %float5.000000e05_573, %1278 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1280 = torch.aten.reciprocal %1279 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_574 = torch.constant.float 1.000000e+00 | |
%1281 = torch.aten.mul.Scalar %1280, %float1.000000e00_574 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1282 = torch.aten.reciprocal %1281 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_575 = torch.constant.float 6.2831853071795862 | |
%1283 = torch.aten.mul.Scalar %1282, %float6.283190e00_575 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_576 = torch.constant.float 8.192000e+03 | |
%1284 = torch.aten.gt.Scalar %1283, %float8.192000e03_576 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_577 = torch.constant.int 8 | |
%1285 = torch.aten.div.Scalar %1281, %int8_577 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1286 = torch.aten.where.self %1284, %1285, %1281 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1287 = torch.aten.reciprocal %1283 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_578 = torch.constant.int 8192 | |
%1288 = torch.aten.mul.Scalar %1287, %int8192_578 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_579 = torch.constant.int 1 | |
%int1_580 = torch.constant.int 1 | |
%1289 = torch.aten.sub.Scalar %1288, %int1_579, %int1_580 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_581 = torch.constant.int 3 | |
%1290 = torch.aten.div.Scalar %1289, %int3_581 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_582 = torch.constant.int 1 | |
%int1_583 = torch.constant.int 1 | |
%1291 = torch.aten.rsub.Scalar %1290, %int1_582, %int1_583 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1292 = torch.aten.mul.Tensor %1291, %1286 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_584 = torch.constant.int 8 | |
%1293 = torch.aten.div.Scalar %1292, %int8_584 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1294 = torch.aten.mul.Tensor %1290, %1286 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_585 = torch.constant.int 1 | |
%1295 = torch.aten.add.Tensor %1293, %1294, %int1_585 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_586 = torch.constant.float 2.048000e+03 | |
%1296 = torch.aten.lt.Scalar %1283, %float2.048000e03_586 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1297 = torch.aten.bitwise_not %1296 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_587 = torch.constant.float 8.192000e+03 | |
%1298 = torch.aten.gt.Scalar %1283, %float8.192000e03_587 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1299 = torch.aten.bitwise_not %1298 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1300 = torch.aten.mul.Tensor %1297, %1299 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1301 = torch.aten.where.self %1300, %1295, %1286 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1302 = torch.prim.ListConstruct %1301, %1301 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_588 = torch.constant.int -1 | |
%1303 = torch.aten.cat %1302, %int-1_588 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_589 = torch.constant.int 6 | |
%1304 = torch.prims.convert_element_type %1303, %int6_589 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_590 = torch.constant.int 1 | |
%1305 = torch.aten.unsqueeze %1275, %int1_590 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_591 = torch.constant.int 6 | |
%1306 = torch.prims.convert_element_type %1305, %int6_591 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_592 = torch.constant.int 0 | |
%1307 = torch.aten.unsqueeze %1304, %int0_592 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_593 = torch.constant.int 6 | |
%1308 = torch.prims.convert_element_type %1307, %int6_593 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1309 = torch.aten.mul.Tensor %1306, %1308 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1310 = torch.aten.cos %1309 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_594 = torch.constant.int 15 | |
%1311 = torch.prims.convert_element_type %1310, %int15_594 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1312 = torch.aten.sin %1309 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_595 = torch.constant.int 15 | |
%1313 = torch.prims.convert_element_type %1312, %int15_595 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_596 = torch.constant.int 0 | |
%int0_597 = torch.constant.int 0 | |
%int1_598 = torch.constant.int 1 | |
%1314 = torch.aten.slice.Tensor %1311, %int0_596, %int0_597, %777, %int1_598 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1314, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_599 = torch.constant.int 1 | |
%int0_600 = torch.constant.int 0 | |
%int9223372036854775807_601 = torch.constant.int 9223372036854775807 | |
%int1_602 = torch.constant.int 1 | |
%1315 = torch.aten.slice.Tensor %1314, %int1_599, %int0_600, %int9223372036854775807_601, %int1_602 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1315, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_603 = torch.constant.int 0 | |
%int0_604 = torch.constant.int 0 | |
%int1_605 = torch.constant.int 1 | |
%1316 = torch.aten.slice.Tensor %1313, %int0_603, %int0_604, %777, %int1_605 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1316, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_606 = torch.constant.int 1 | |
%int0_607 = torch.constant.int 0 | |
%int9223372036854775807_608 = torch.constant.int 9223372036854775807 | |
%int1_609 = torch.constant.int 1 | |
%1317 = torch.aten.slice.Tensor %1316, %int1_606, %int0_607, %int9223372036854775807_608, %int1_609 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1317, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_610 = torch.constant.int 0 | |
%1318 = torch.aten.unsqueeze %1315, %int0_610 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1318, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_611 = torch.constant.int 1 | |
%int0_612 = torch.constant.int 0 | |
%int9223372036854775807_613 = torch.constant.int 9223372036854775807 | |
%int1_614 = torch.constant.int 1 | |
%1319 = torch.aten.slice.Tensor %1318, %int1_611, %int0_612, %int9223372036854775807_613, %int1_614 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1319, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_615 = torch.constant.int 2 | |
%1320 = torch.aten.unsqueeze %1319, %int2_615 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1320, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_616 = torch.constant.int 3 | |
%int0_617 = torch.constant.int 0 | |
%int9223372036854775807_618 = torch.constant.int 9223372036854775807 | |
%int1_619 = torch.constant.int 1 | |
%1321 = torch.aten.slice.Tensor %1320, %int3_616, %int0_617, %int9223372036854775807_618, %int1_619 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1321, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_620 = torch.constant.int 4 | |
%int1_621 = torch.constant.int 1 | |
%int1_622 = torch.constant.int 1 | |
%int1_623 = torch.constant.int 1 | |
%1322 = torch.prim.ListConstruct %int4_620, %int1_621, %int1_622, %int1_623 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1323 = torch.aten.repeat %1321, %1322 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1323, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_624 = torch.constant.int 0 | |
%1324 = torch.aten.unsqueeze %1317, %int0_624 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1324, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_625 = torch.constant.int 1 | |
%int0_626 = torch.constant.int 0 | |
%int9223372036854775807_627 = torch.constant.int 9223372036854775807 | |
%int1_628 = torch.constant.int 1 | |
%1325 = torch.aten.slice.Tensor %1324, %int1_625, %int0_626, %int9223372036854775807_627, %int1_628 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1325, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_629 = torch.constant.int 2 | |
%1326 = torch.aten.unsqueeze %1325, %int2_629 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1326, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_630 = torch.constant.int 3 | |
%int0_631 = torch.constant.int 0 | |
%int9223372036854775807_632 = torch.constant.int 9223372036854775807 | |
%int1_633 = torch.constant.int 1 | |
%1327 = torch.aten.slice.Tensor %1326, %int3_630, %int0_631, %int9223372036854775807_632, %int1_633 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1327, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_634 = torch.constant.int 4 | |
%int1_635 = torch.constant.int 1 | |
%int1_636 = torch.constant.int 1 | |
%int1_637 = torch.constant.int 1 | |
%1328 = torch.prim.ListConstruct %int4_634, %int1_635, %int1_636, %int1_637 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1329 = torch.aten.repeat %1327, %1328 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1329, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
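// Apply the rotary embedding to K, identical to the query path but over the 8 KV heads.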
%1330 = torch.aten.mul.Tensor %1209, %1323 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1330, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_638 = torch.constant.int 3 | |
%int0_639 = torch.constant.int 0 | |
%int64_640 = torch.constant.int 64 | |
%int1_641 = torch.constant.int 1 | |
%1331 = torch.aten.slice.Tensor %1209, %int3_638, %int0_639, %int64_640, %int1_641 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1331, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_642 = torch.constant.int 3 | |
%int64_643 = torch.constant.int 64 | |
%int9223372036854775807_644 = torch.constant.int 9223372036854775807 | |
%int1_645 = torch.constant.int 1 | |
%1332 = torch.aten.slice.Tensor %1209, %int3_642, %int64_643, %int9223372036854775807_644, %int1_645 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1332, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1333 = torch.aten.neg %1332 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1333, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1334 = torch.prim.ListConstruct %1333, %1331 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_646 = torch.constant.int -1 | |
%1335 = torch.aten.cat %1334, %int-1_646 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1335, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%1336 = torch.aten.mul.Tensor %1335, %1329 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1336, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_647 = torch.constant.int 1 | |
%1337 = torch.aten.add.Tensor %1330, %1336, %int1_647 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1337, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
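// Compute flat paged-cache slot indices from the page table %arg2 (page id * 64 plus an offset that appears to select this layer's K slice within each 64-entry page block), and regroup the rotated K into 32-token cache pages.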
%int64_648 = torch.constant.int 64 | |
%1338 = torch.aten.mul.Scalar %arg2, %int64_648 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1338, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int2_649 = torch.constant.int 2 | |
%int1_650 = torch.constant.int 1 | |
%1339 = torch.aten.add.Scalar %1338, %int2_649, %int1_650 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1339, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_651 = torch.constant.int 4 | |
%int32_652 = torch.constant.int 32 | |
%int8_653 = torch.constant.int 8 | |
%int128_654 = torch.constant.int 128 | |
%1340 = torch.prim.ListConstruct %int4_651, %775, %int32_652, %int8_653, %int128_654 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1341 = torch.aten.view %1337, %1340 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1341, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_655 = torch.constant.int 32 | |
%int8_656 = torch.constant.int 8 | |
%int128_657 = torch.constant.int 128 | |
%1342 = torch.prim.ListConstruct %997, %int32_655, %int8_656, %int128_657 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1343 = torch.aten.view %1341, %1342 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1343, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1344 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1345 = torch.aten.view %1339, %1344 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1345, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
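// Scatter K into the shared paged KV cache: both the values and the cache are bit-cast to si8 (seemingly a workaround for index_put on f8 element types), written with index_put at the computed slots, then bit-cast back and folded into the flat [?,2097152] cache layout.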
%int26_658 = torch.constant.int 26 | |
%1346 = torch.prims.convert_element_type %1343, %int26_658 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1346, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_659 = torch.constant.int 1 | |
%1347 = torch.aten.view.dtype %1346, %int1_659 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1347, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1348 = torch.aten.detach %1347 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1348, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1349 = torch.aten.detach %1348 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1349, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_660 = torch.constant.int 32 | |
%int2_661 = torch.constant.int 2 | |
%int32_662 = torch.constant.int 32 | |
%int8_663 = torch.constant.int 8 | |
%int128_664 = torch.constant.int 128 | |
%1350 = torch.prim.ListConstruct %776, %int32_660, %int2_661, %int32_662, %int8_663, %int128_664 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1351 = torch.aten.view %1044, %1350 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1351, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_665 = torch.constant.int 32 | |
%int8_666 = torch.constant.int 8 | |
%int128_667 = torch.constant.int 128 | |
%1352 = torch.prim.ListConstruct %990, %int32_665, %int8_666, %int128_667 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1353 = torch.aten.view %1351, %1352 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1353, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_668 = torch.constant.int 1 | |
%1354 = torch.aten.view.dtype %1353, %int1_668 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1354, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1355 = torch.aten.detach %1354 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1355, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1356 = torch.aten.detach %1355 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1356, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1357 = torch.prim.ListConstruct %1345 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_669 = torch.constant.bool false | |
%1358 = torch.aten.index_put %1356, %1357, %1349, %false_669 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1358, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_670 = torch.constant.int 26 | |
%1359 = torch.aten.view.dtype %1358, %int26_670 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1359, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1360 = torch.aten.detach %1359 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1360, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1361 = torch.aten.detach %1360 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1361, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_671 = torch.constant.int 32 | |
%int2_672 = torch.constant.int 2 | |
%int32_673 = torch.constant.int 32 | |
%int8_674 = torch.constant.int 8 | |
%int128_675 = torch.constant.int 128 | |
%1362 = torch.prim.ListConstruct %776, %int32_671, %int2_672, %int32_673, %int8_674, %int128_675 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1363 = torch.aten.view %1361, %1362 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1363, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_676 = torch.constant.int 2097152 | |
%1364 = torch.prim.ListConstruct %776, %int2097152_676 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1365 = torch.aten.view %1363, %1364 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1365, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
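// Write V into the cache the same way, at slot indices offset by one.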
%int4_677 = torch.constant.int 4 | |
%int32_678 = torch.constant.int 32 | |
%int8_679 = torch.constant.int 8 | |
%int128_680 = torch.constant.int 128 | |
%1366 = torch.prim.ListConstruct %int4_677, %775, %int32_678, %int8_679, %int128_680 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1367 = torch.aten.view %1211, %1366 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1367, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_681 = torch.constant.int 32 | |
%int8_682 = torch.constant.int 8 | |
%int128_683 = torch.constant.int 128 | |
%1368 = torch.prim.ListConstruct %997, %int32_681, %int8_682, %int128_683 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1369 = torch.aten.view %1367, %1368 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1369, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_684 = torch.constant.int 1 | |
%int1_685 = torch.constant.int 1 | |
%1370 = torch.aten.add.Scalar %1339, %int1_684, %int1_685 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1370, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%1371 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1372 = torch.aten.view %1370, %1371 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1372, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_686 = torch.constant.int 26 | |
%1373 = torch.prims.convert_element_type %1369, %int26_686 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1373, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_687 = torch.constant.int 1 | |
%1374 = torch.aten.view.dtype %1373, %int1_687 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1374, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1375 = torch.aten.detach %1374 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1375, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1376 = torch.aten.detach %1375 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1376, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_688 = torch.constant.int 32 | |
%int2_689 = torch.constant.int 2 | |
%int32_690 = torch.constant.int 32 | |
%int8_691 = torch.constant.int 8 | |
%int128_692 = torch.constant.int 128 | |
%1377 = torch.prim.ListConstruct %776, %int32_688, %int2_689, %int32_690, %int8_691, %int128_692 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1378 = torch.aten.view %1365, %1377 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1378, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_693 = torch.constant.int 32 | |
%int8_694 = torch.constant.int 8 | |
%int128_695 = torch.constant.int 128 | |
%1379 = torch.prim.ListConstruct %990, %int32_693, %int8_694, %int128_695 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1380 = torch.aten.view %1378, %1379 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1380, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_696 = torch.constant.int 1 | |
%1381 = torch.aten.view.dtype %1380, %int1_696 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1381, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1382 = torch.aten.detach %1381 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1382, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1383 = torch.aten.detach %1382 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1383, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1384 = torch.prim.ListConstruct %1372 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_697 = torch.constant.bool false | |
%1385 = torch.aten.index_put %1383, %1384, %1376, %false_697 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1385, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_698 = torch.constant.int 26 | |
%1386 = torch.aten.view.dtype %1385, %int26_698 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1386, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1387 = torch.aten.detach %1386 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1387, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1388 = torch.aten.detach %1387 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1388, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_699 = torch.constant.int 32 | |
%int2_700 = torch.constant.int 2 | |
%int32_701 = torch.constant.int 32 | |
%int8_702 = torch.constant.int 8 | |
%int128_703 = torch.constant.int 128 | |
%1389 = torch.prim.ListConstruct %776, %int32_699, %int2_700, %int32_701, %int8_702, %int128_703 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1390 = torch.aten.view %1388, %1389 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1390, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_704 = torch.constant.int 2097152 | |
%1391 = torch.prim.ListConstruct %776, %int2097152_704 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1392 = torch.aten.view %1390, %1391 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1392, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
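// Grouped-query attention: replicate each of the 8 KV heads 4 times so K and V line up with the 32 query heads ([4,?,8,128] -> [4,?,32,128]).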
%int-2_705 = torch.constant.int -2 | |
%1393 = torch.aten.unsqueeze %1337, %int-2_705 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1393, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_706 = torch.constant.int 4 | |
%int8_707 = torch.constant.int 8 | |
%int4_708 = torch.constant.int 4 | |
%int128_709 = torch.constant.int 128 | |
%1394 = torch.prim.ListConstruct %int4_706, %777, %int8_707, %int4_708, %int128_709 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_710 = torch.constant.bool false | |
%1395 = torch.aten.expand %1393, %1394, %false_710 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1395, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_711 = torch.constant.int 0 | |
%1396 = torch.aten.clone %1395, %int0_711 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1396, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_712 = torch.constant.int 4 | |
%int32_713 = torch.constant.int 32 | |
%int128_714 = torch.constant.int 128 | |
%1397 = torch.prim.ListConstruct %int4_712, %777, %int32_713, %int128_714 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1398 = torch.aten._unsafe_view %1396, %1397 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1398, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_715 = torch.constant.int -2 | |
%1399 = torch.aten.unsqueeze %1211, %int-2_715 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1399, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_716 = torch.constant.int 4 | |
%int8_717 = torch.constant.int 8 | |
%int4_718 = torch.constant.int 4 | |
%int128_719 = torch.constant.int 128 | |
%1400 = torch.prim.ListConstruct %int4_716, %777, %int8_717, %int4_718, %int128_719 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_720 = torch.constant.bool false | |
%1401 = torch.aten.expand %1399, %1400, %false_720 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1401, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_721 = torch.constant.int 0 | |
%1402 = torch.aten.clone %1401, %int0_721 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1402, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_722 = torch.constant.int 4 | |
%int32_723 = torch.constant.int 32 | |
%int128_724 = torch.constant.int 128 | |
%1403 = torch.prim.ListConstruct %int4_722, %777, %int32_723, %int128_724 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1404 = torch.aten._unsafe_view %1402, %1403 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1404, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
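// Transpose Q, K and V to [batch, heads, seq, head_dim] and re-assert their f8E4M3FNUZ element type.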
%int1_725 = torch.constant.int 1 | |
%int2_726 = torch.constant.int 2 | |
%1405 = torch.aten.transpose.int %1274, %int1_725, %int2_726 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1405, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_727 = torch.constant.int 1 | |
%int2_728 = torch.constant.int 2 | |
%1406 = torch.aten.transpose.int %1398, %int1_727, %int2_728 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1406, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_729 = torch.constant.int 1 | |
%int2_730 = torch.constant.int 2 | |
%1407 = torch.aten.transpose.int %1404, %int1_729, %int2_730 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1407, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_731 = torch.constant.int 26 | |
%1408 = torch.prims.convert_element_type %1405, %int26_731 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1408, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_732 = torch.constant.int 26 | |
%1409 = torch.prims.convert_element_type %1406, %int26_732 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1409, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_733 = torch.constant.int 26 | |
%1410 = torch.prims.convert_element_type %1407, %int26_733 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1410, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
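// Cast the attention mask to f8 and take the first batch element's [seq, seq] slice, the 2-D mask form the fused attention kernel consumes.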
%int26_734 = torch.constant.int 26 | |
%1411 = torch.prims.convert_element_type %803, %int26_734 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1411, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_735 = torch.constant.int 0 | |
%int0_736 = torch.constant.int 0 | |
%1412 = torch.aten.select.int %1411, %int0_735, %int0_736 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1412, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_737 = torch.constant.int 0 | |
%int0_738 = torch.constant.int 0 | |
%1413 = torch.aten.select.int %1412, %int0_737, %int0_738 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1413, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_739 = torch.constant.int 0 | |
%int0_740 = torch.constant.int 0 | |
%int9223372036854775807_741 = torch.constant.int 9223372036854775807 | |
%int1_742 = torch.constant.int 1 | |
%1414 = torch.aten.slice.Tensor %1413, %int0_739, %int0_740, %int9223372036854775807_741, %int1_742 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1414, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_743 = torch.constant.int 1 | |
%int0_744 = torch.constant.int 0 | |
%int9223372036854775807_745 = torch.constant.int 9223372036854775807 | |
%int1_746 = torch.constant.int 1 | |
%1415 = torch.aten.slice.Tensor %1414, %int1_743, %int0_744, %int9223372036854775807_745, %int1_746 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1415, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
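// Masked flash-attention microkernel over the f8 Q/K/V with an f32 softmax scale and the f8 mask; the result comes back as [4,32,?,128] in f32.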
%none_747 = torch.constant.none | |
%1416 = torch.aten.clone %35, %none_747 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%1417 = torch.aten.detach %1416 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1418 = torch.aten.detach %1417 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1419 = torch.aten.detach %1418 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1420 = torch_c.to_builtin_tensor %1408 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1421 = torch_c.to_builtin_tensor %1409 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1422 = torch_c.to_builtin_tensor %1410 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1423 = torch_c.to_builtin_tensor %1415 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%1424 = torch_c.to_builtin_tensor %1419 : !torch.vtensor<[],f32> -> tensor<f32> | |
%1425 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%1420, %1421, %1422, %1424, %1423) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%1426 = torch_c.from_builtin_tensor %1425 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %1426, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
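    // Transpose heads back ([4,32,seq,128] -> [4,seq,32,128]) and flatten to [4,seq,4096].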
%int1_748 = torch.constant.int 1 | |
%int2_749 = torch.constant.int 2 | |
%1427 = torch.aten.transpose.int %1426, %int1_748, %int2_749 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1427, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_750 = torch.constant.int 0 | |
%1428 = torch.aten.clone %1427, %int0_750 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1428, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_751 = torch.constant.int 4 | |
%int4096_752 = torch.constant.int 4096 | |
%1429 = torch.prim.ListConstruct %int4_751, %777, %int4096_752 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1430 = torch.aten._unsafe_view %1428, %1429 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1430, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
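    // Quantize for the attention output projection: divide by the input scale, clamp to the f8E4M3FNUZ range (+/-240), cast to f8.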
%1431 = torch.aten.div.Tensor %1430, %36 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1431, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_753 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_754 = torch.constant.float 2.400000e+02 | |
%1432 = torch.aten.clamp %1431, %float-2.400000e02_753, %float2.400000e02_754 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1432, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_755 = torch.constant.int 26 | |
%1433 = torch.prims.convert_element_type %1432, %int26_755 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1433, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
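    // Batched matmul (transpose_b) against the broadcast 4096x4096 output-projection weight, accumulating in f32.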
%int0_756 = torch.constant.int 0 | |
%1434 = torch.aten.unsqueeze %37, %int0_756 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_757 = torch.constant.int 4 | |
%int4096_758 = torch.constant.int 4096 | |
%int4096_759 = torch.constant.int 4096 | |
%1435 = torch.prim.ListConstruct %int4_757, %int4096_758, %int4096_759 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_760 = torch.constant.bool false | |
%1436 = torch.aten.expand %1434, %1435, %false_760 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1437 = torch_c.to_builtin_tensor %1433 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1438 = torch_c.to_builtin_tensor %1436 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1439 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1437, %1438) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1440 = torch_c.from_builtin_tensor %1439 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1440, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
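    // Rescale the f32 accumulator by the combined input/weight scales (dequantization of the f8 matmul result).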
%1441 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1442 = torch.aten.permute %38, %1441 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1443 = torch.aten.mul.Tensor %36, %1442 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_761 = torch.constant.int 6 | |
%1444 = torch.prims.convert_element_type %1440, %int6_761 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1444, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1445 = torch.aten.mul.Tensor %1444, %1443 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1445, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
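    // Residual connection: add the projected attention output onto the incoming hidden state.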
%int1_762 = torch.constant.int 1 | |
%1446 = torch.aten.add.Tensor %1156, %1445, %int1_762 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1446, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
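    // RMSNorm before the FFN: mean of squares over the last dim, rsqrt with eps 1e-5, scaled by the bf16 norm weight.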
%int6_763 = torch.constant.int 6 | |
%1447 = torch.prims.convert_element_type %1446, %int6_763 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1447, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_764 = torch.constant.int 2 | |
%1448 = torch.aten.pow.Tensor_Scalar %1447, %int2_764 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1448, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_765 = torch.constant.int -1 | |
%1449 = torch.prim.ListConstruct %int-1_765 : (!torch.int) -> !torch.list<int> | |
%true_766 = torch.constant.bool true | |
%none_767 = torch.constant.none | |
%1450 = torch.aten.mean.dim %1448, %1449, %true_766, %none_767 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1450, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_768 = torch.constant.float 1.000000e-05 | |
%int1_769 = torch.constant.int 1 | |
%1451 = torch.aten.add.Scalar %1450, %float1.000000e-05_768, %int1_769 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1451, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1452 = torch.aten.rsqrt %1451 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1452, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1453 = torch.aten.mul.Tensor %1447, %1452 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1453, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_770 = torch.constant.int 6 | |
%1454 = torch.prims.convert_element_type %1453, %int6_770 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1454, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1455 = torch.aten.mul.Tensor %39, %1454 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1455, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
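    // FFN gate path: quantize to f8, project 4096 -> 14336, rescale, and apply SiLU.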
%int6_771 = torch.constant.int 6 | |
%1456 = torch.prims.convert_element_type %1455, %int6_771 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1456, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1457 = torch.aten.div.Tensor %1456, %40 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1457, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_772 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_773 = torch.constant.float 2.400000e+02 | |
%1458 = torch.aten.clamp %1457, %float-2.400000e02_772, %float2.400000e02_773 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1458, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_774 = torch.constant.int 26 | |
%1459 = torch.prims.convert_element_type %1458, %int26_774 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1459, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_775 = torch.constant.int 0 | |
%1460 = torch.aten.unsqueeze %41, %int0_775 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_776 = torch.constant.int 4 | |
%int14336_777 = torch.constant.int 14336 | |
%int4096_778 = torch.constant.int 4096 | |
%1461 = torch.prim.ListConstruct %int4_776, %int14336_777, %int4096_778 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_779 = torch.constant.bool false | |
%1462 = torch.aten.expand %1460, %1461, %false_779 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1463 = torch_c.to_builtin_tensor %1459 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1464 = torch_c.to_builtin_tensor %1462 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1465 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1463, %1464) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1466 = torch_c.from_builtin_tensor %1465 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1466, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1467 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1468 = torch.aten.permute %42, %1467 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1469 = torch.aten.mul.Tensor %40, %1468 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_780 = torch.constant.int 6 | |
%1470 = torch.prims.convert_element_type %1466, %int6_780 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1470, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1471 = torch.aten.mul.Tensor %1470, %1469 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1471, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1472 = torch.aten.silu %1471 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1472, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
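    // FFN up path: the same quantize/project/rescale to 14336, then elementwise multiply with the SiLU-gated activations (SwiGLU).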
%1473 = torch.aten.div.Tensor %1456, %43 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1473, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_781 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_782 = torch.constant.float 2.400000e+02 | |
%1474 = torch.aten.clamp %1473, %float-2.400000e02_781, %float2.400000e02_782 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1474, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_783 = torch.constant.int 26 | |
%1475 = torch.prims.convert_element_type %1474, %int26_783 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1475, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_784 = torch.constant.int 0 | |
%1476 = torch.aten.unsqueeze %44, %int0_784 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_785 = torch.constant.int 4 | |
%int14336_786 = torch.constant.int 14336 | |
%int4096_787 = torch.constant.int 4096 | |
%1477 = torch.prim.ListConstruct %int4_785, %int14336_786, %int4096_787 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_788 = torch.constant.bool false | |
%1478 = torch.aten.expand %1476, %1477, %false_788 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1479 = torch_c.to_builtin_tensor %1475 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1480 = torch_c.to_builtin_tensor %1478 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1481 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1479, %1480) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1482 = torch_c.from_builtin_tensor %1481 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1482, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1483 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1484 = torch.aten.permute %45, %1483 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1485 = torch.aten.mul.Tensor %43, %1484 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_789 = torch.constant.int 6 | |
%1486 = torch.prims.convert_element_type %1482, %int6_789 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1486, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1487 = torch.aten.mul.Tensor %1486, %1485 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1487, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1488 = torch.aten.mul.Tensor %1472, %1487 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1488, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
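    // FFN down projection: quantize, project 14336 -> 4096, rescale.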
%1489 = torch.aten.div.Tensor %1488, %46 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1489, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_790 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_791 = torch.constant.float 2.400000e+02 | |
%1490 = torch.aten.clamp %1489, %float-2.400000e02_790, %float2.400000e02_791 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1490, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_792 = torch.constant.int 26 | |
%1491 = torch.prims.convert_element_type %1490, %int26_792 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1491, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_793 = torch.constant.int 0 | |
%1492 = torch.aten.unsqueeze %47, %int0_793 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_794 = torch.constant.int 4 | |
%int4096_795 = torch.constant.int 4096 | |
%int14336_796 = torch.constant.int 14336 | |
%1493 = torch.prim.ListConstruct %int4_794, %int4096_795, %int14336_796 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_797 = torch.constant.bool false | |
%1494 = torch.aten.expand %1492, %1493, %false_797 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%1495 = torch_c.to_builtin_tensor %1491 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%1496 = torch_c.to_builtin_tensor %1494 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%1497 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%1495, %1496) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1498 = torch_c.from_builtin_tensor %1497 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1498, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1499 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1500 = torch.aten.permute %48, %1499 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1501 = torch.aten.mul.Tensor %46, %1500 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_798 = torch.constant.int 6 | |
%1502 = torch.prims.convert_element_type %1498, %int6_798 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1502, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1503 = torch.aten.mul.Tensor %1502, %1501 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1503, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
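    // Residual add closing out this transformer block.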
%int1_799 = torch.constant.int 1 | |
%1504 = torch.aten.add.Tensor %1446, %1503, %int1_799 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1504, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
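    // RMSNorm of the updated hidden state, feeding the next attention block's projections.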
%int6_800 = torch.constant.int 6 | |
%1505 = torch.prims.convert_element_type %1504, %int6_800 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1505, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_801 = torch.constant.int 2 | |
%1506 = torch.aten.pow.Tensor_Scalar %1505, %int2_801 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1506, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_802 = torch.constant.int -1 | |
%1507 = torch.prim.ListConstruct %int-1_802 : (!torch.int) -> !torch.list<int> | |
%true_803 = torch.constant.bool true | |
%none_804 = torch.constant.none | |
%1508 = torch.aten.mean.dim %1506, %1507, %true_803, %none_804 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1508, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_805 = torch.constant.float 1.000000e-05 | |
%int1_806 = torch.constant.int 1 | |
%1509 = torch.aten.add.Scalar %1508, %float1.000000e-05_805, %int1_806 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1509, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1510 = torch.aten.rsqrt %1509 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1510, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1511 = torch.aten.mul.Tensor %1505, %1510 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1511, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_807 = torch.constant.int 6 | |
%1512 = torch.prims.convert_element_type %1511, %int6_807 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1512, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1513 = torch.aten.mul.Tensor %49, %1512 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1513, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_808 = torch.constant.int 6 | |
%1514 = torch.prims.convert_element_type %1513, %int6_808 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1514, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
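    // Q/K/V projections for the next attention block: quantize to f8, then project 4096 -> 4096 (Q) and 4096 -> 1024 (K and V).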
%1515 = torch.aten.div.Tensor %1514, %50 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1515, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_809 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_810 = torch.constant.float 2.400000e+02 | |
%1516 = torch.aten.clamp %1515, %float-2.400000e02_809, %float2.400000e02_810 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1516, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_811 = torch.constant.int 26 | |
%1517 = torch.prims.convert_element_type %1516, %int26_811 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1517, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_812 = torch.constant.int 0 | |
%1518 = torch.aten.unsqueeze %51, %int0_812 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_813 = torch.constant.int 4 | |
%int4096_814 = torch.constant.int 4096 | |
%int4096_815 = torch.constant.int 4096 | |
%1519 = torch.prim.ListConstruct %int4_813, %int4096_814, %int4096_815 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_816 = torch.constant.bool false | |
%1520 = torch.aten.expand %1518, %1519, %false_816 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1521 = torch_c.to_builtin_tensor %1517 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1522 = torch_c.to_builtin_tensor %1520 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1523 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1521, %1522) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1524 = torch_c.from_builtin_tensor %1523 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1524, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1525 = torch.aten.div.Tensor %1524, %52 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1525, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_817 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_818 = torch.constant.float 2.400000e+02 | |
%1526 = torch.aten.clamp %1525, %float-2.400000e02_817, %float2.400000e02_818 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1526, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_819 = torch.constant.int 26 | |
%1527 = torch.prims.convert_element_type %1526, %int26_819 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1527, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%1528 = torch.aten.div.Tensor %1514, %53 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1528, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_820 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_821 = torch.constant.float 2.400000e+02 | |
%1529 = torch.aten.clamp %1528, %float-2.400000e02_820, %float2.400000e02_821 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1529, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_822 = torch.constant.int 26 | |
%1530 = torch.prims.convert_element_type %1529, %int26_822 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1530, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_823 = torch.constant.int 0 | |
%1531 = torch.aten.unsqueeze %54, %int0_823 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_824 = torch.constant.int 4 | |
%int1024_825 = torch.constant.int 1024 | |
%int4096_826 = torch.constant.int 4096 | |
%1532 = torch.prim.ListConstruct %int4_824, %int1024_825, %int4096_826 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_827 = torch.constant.bool false | |
%1533 = torch.aten.expand %1531, %1532, %false_827 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1534 = torch_c.to_builtin_tensor %1530 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1535 = torch_c.to_builtin_tensor %1533 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1536 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1534, %1535) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1537 = torch_c.from_builtin_tensor %1536 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1537, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1538 = torch.aten.div.Tensor %1537, %55 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1538, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_828 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_829 = torch.constant.float 2.400000e+02 | |
%1539 = torch.aten.clamp %1538, %float-2.400000e02_828, %float2.400000e02_829 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1539, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_830 = torch.constant.int 26 | |
%1540 = torch.prims.convert_element_type %1539, %int26_830 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1540, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%1541 = torch.aten.div.Tensor %1514, %56 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1541, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_831 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_832 = torch.constant.float 2.400000e+02 | |
%1542 = torch.aten.clamp %1541, %float-2.400000e02_831, %float2.400000e02_832 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1542, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_833 = torch.constant.int 26 | |
%1543 = torch.prims.convert_element_type %1542, %int26_833 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1543, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_834 = torch.constant.int 0 | |
%1544 = torch.aten.unsqueeze %57, %int0_834 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_835 = torch.constant.int 4 | |
%int1024_836 = torch.constant.int 1024 | |
%int4096_837 = torch.constant.int 4096 | |
%1545 = torch.prim.ListConstruct %int4_835, %int1024_836, %int4096_837 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_838 = torch.constant.bool false | |
%1546 = torch.aten.expand %1544, %1545, %false_838 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1547 = torch_c.to_builtin_tensor %1543 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1548 = torch_c.to_builtin_tensor %1546 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1549 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1547, %1548) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1550 = torch_c.from_builtin_tensor %1549 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1550, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1551 = torch.aten.div.Tensor %1550, %58 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1551, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_839 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_840 = torch.constant.float 2.400000e+02 | |
%1552 = torch.aten.clamp %1551, %float-2.400000e02_839, %float2.400000e02_840 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1552, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_841 = torch.constant.int 26 | |
%1553 = torch.prims.convert_element_type %1552, %int26_841 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1553, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
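    // Reshape into per-head layout: 32 query heads and 8 KV heads of size 128 (grouped-query attention).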
%int4_842 = torch.constant.int 4 | |
%int32_843 = torch.constant.int 32 | |
%int128_844 = torch.constant.int 128 | |
%1554 = torch.prim.ListConstruct %int4_842, %777, %int32_843, %int128_844 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1555 = torch.aten.view %1527, %1554 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1555, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_845 = torch.constant.int 4 | |
%int8_846 = torch.constant.int 8 | |
%int128_847 = torch.constant.int 128 | |
%1556 = torch.prim.ListConstruct %int4_845, %777, %int8_846, %int128_847 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1557 = torch.aten.view %1540, %1556 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1557, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_848 = torch.constant.int 4 | |
%int8_849 = torch.constant.int 8 | |
%int128_850 = torch.constant.int 128 | |
%1558 = torch.prim.ListConstruct %int4_848, %777, %int8_849, %int128_850 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1559 = torch.aten.view %1553, %1558 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1559, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
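    // Build the RoPE tables: inverse frequencies with base 5.0e5 and what appears to be Llama-3-style frequency scaling around an original 8192 context, then position-by-frequency cos/sin caches in bf16.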
%int131072_851 = torch.constant.int 131072 | |
%none_852 = torch.constant.none | |
%none_853 = torch.constant.none | |
%cpu_854 = torch.constant.device "cpu" | |
%false_855 = torch.constant.bool false | |
%1560 = torch.aten.arange %int131072_851, %none_852, %none_853, %cpu_854, %false_855 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_856 = torch.constant.int 0 | |
%int128_857 = torch.constant.int 128 | |
%int2_858 = torch.constant.int 2 | |
%int4_859 = torch.constant.int 4 | |
%none_860 = torch.constant.none | |
%cpu_861 = torch.constant.device "cpu" | |
%false_862 = torch.constant.bool false | |
%1561 = torch.aten.arange.start_step %int0_856, %int128_857, %int2_858, %int4_859, %none_860, %cpu_861, %false_862 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_863 = torch.constant.int 6 | |
%1562 = torch.prims.convert_element_type %1561, %int6_863 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_864 = torch.constant.int 128 | |
%1563 = torch.aten.div.Scalar %1562, %int128_864 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_865 = torch.constant.float 5.000000e+05 | |
%1564 = torch.aten.pow.Scalar %float5.000000e05_865, %1563 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1565 = torch.aten.reciprocal %1564 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_866 = torch.constant.float 1.000000e+00 | |
%1566 = torch.aten.mul.Scalar %1565, %float1.000000e00_866 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1567 = torch.aten.reciprocal %1566 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_867 = torch.constant.float 6.2831853071795862 | |
%1568 = torch.aten.mul.Scalar %1567, %float6.283190e00_867 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_868 = torch.constant.float 8.192000e+03 | |
%1569 = torch.aten.gt.Scalar %1568, %float8.192000e03_868 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_869 = torch.constant.int 8 | |
%1570 = torch.aten.div.Scalar %1566, %int8_869 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1571 = torch.aten.where.self %1569, %1570, %1566 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1572 = torch.aten.reciprocal %1568 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_870 = torch.constant.int 8192 | |
%1573 = torch.aten.mul.Scalar %1572, %int8192_870 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_871 = torch.constant.int 1 | |
%int1_872 = torch.constant.int 1 | |
%1574 = torch.aten.sub.Scalar %1573, %int1_871, %int1_872 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_873 = torch.constant.int 3 | |
%1575 = torch.aten.div.Scalar %1574, %int3_873 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_874 = torch.constant.int 1 | |
%int1_875 = torch.constant.int 1 | |
%1576 = torch.aten.rsub.Scalar %1575, %int1_874, %int1_875 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1577 = torch.aten.mul.Tensor %1576, %1571 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_876 = torch.constant.int 8 | |
%1578 = torch.aten.div.Scalar %1577, %int8_876 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1579 = torch.aten.mul.Tensor %1575, %1571 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_877 = torch.constant.int 1 | |
%1580 = torch.aten.add.Tensor %1578, %1579, %int1_877 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_878 = torch.constant.float 2.048000e+03 | |
%1581 = torch.aten.lt.Scalar %1568, %float2.048000e03_878 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1582 = torch.aten.bitwise_not %1581 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_879 = torch.constant.float 8.192000e+03 | |
%1583 = torch.aten.gt.Scalar %1568, %float8.192000e03_879 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1584 = torch.aten.bitwise_not %1583 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1585 = torch.aten.mul.Tensor %1582, %1584 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1586 = torch.aten.where.self %1585, %1580, %1571 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1587 = torch.prim.ListConstruct %1586, %1586 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_880 = torch.constant.int -1 | |
%1588 = torch.aten.cat %1587, %int-1_880 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_881 = torch.constant.int 6 | |
%1589 = torch.prims.convert_element_type %1588, %int6_881 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_882 = torch.constant.int 1 | |
%1590 = torch.aten.unsqueeze %1560, %int1_882 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_883 = torch.constant.int 6 | |
%1591 = torch.prims.convert_element_type %1590, %int6_883 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_884 = torch.constant.int 0 | |
%1592 = torch.aten.unsqueeze %1589, %int0_884 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_885 = torch.constant.int 6 | |
%1593 = torch.prims.convert_element_type %1592, %int6_885 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1594 = torch.aten.mul.Tensor %1591, %1593 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1595 = torch.aten.cos %1594 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_886 = torch.constant.int 15 | |
%1596 = torch.prims.convert_element_type %1595, %int15_886 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1597 = torch.aten.sin %1594 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_887 = torch.constant.int 15 | |
%1598 = torch.prims.convert_element_type %1597, %int15_887 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
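    // Slice the cos/sin caches to the current sequence length and broadcast them to [4, seq, 1, 128].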
%int0_888 = torch.constant.int 0 | |
%int0_889 = torch.constant.int 0 | |
%int1_890 = torch.constant.int 1 | |
%1599 = torch.aten.slice.Tensor %1596, %int0_888, %int0_889, %777, %int1_890 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1599, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_891 = torch.constant.int 1 | |
%int0_892 = torch.constant.int 0 | |
%int9223372036854775807_893 = torch.constant.int 9223372036854775807 | |
%int1_894 = torch.constant.int 1 | |
%1600 = torch.aten.slice.Tensor %1599, %int1_891, %int0_892, %int9223372036854775807_893, %int1_894 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1600, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_895 = torch.constant.int 0 | |
%int0_896 = torch.constant.int 0 | |
%int1_897 = torch.constant.int 1 | |
%1601 = torch.aten.slice.Tensor %1598, %int0_895, %int0_896, %777, %int1_897 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1601, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_898 = torch.constant.int 1 | |
%int0_899 = torch.constant.int 0 | |
%int9223372036854775807_900 = torch.constant.int 9223372036854775807 | |
%int1_901 = torch.constant.int 1 | |
%1602 = torch.aten.slice.Tensor %1601, %int1_898, %int0_899, %int9223372036854775807_900, %int1_901 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1602, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_902 = torch.constant.int 0 | |
%1603 = torch.aten.unsqueeze %1600, %int0_902 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1603, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_903 = torch.constant.int 1 | |
%int0_904 = torch.constant.int 0 | |
%int9223372036854775807_905 = torch.constant.int 9223372036854775807 | |
%int1_906 = torch.constant.int 1 | |
%1604 = torch.aten.slice.Tensor %1603, %int1_903, %int0_904, %int9223372036854775807_905, %int1_906 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1604, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_907 = torch.constant.int 2 | |
%1605 = torch.aten.unsqueeze %1604, %int2_907 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1605, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_908 = torch.constant.int 3 | |
%int0_909 = torch.constant.int 0 | |
%int9223372036854775807_910 = torch.constant.int 9223372036854775807 | |
%int1_911 = torch.constant.int 1 | |
%1606 = torch.aten.slice.Tensor %1605, %int3_908, %int0_909, %int9223372036854775807_910, %int1_911 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1606, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_912 = torch.constant.int 4 | |
%int1_913 = torch.constant.int 1 | |
%int1_914 = torch.constant.int 1 | |
%int1_915 = torch.constant.int 1 | |
%1607 = torch.prim.ListConstruct %int4_912, %int1_913, %int1_914, %int1_915 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1608 = torch.aten.repeat %1606, %1607 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1608, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_916 = torch.constant.int 0 | |
%1609 = torch.aten.unsqueeze %1602, %int0_916 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1609, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_917 = torch.constant.int 1 | |
%int0_918 = torch.constant.int 0 | |
%int9223372036854775807_919 = torch.constant.int 9223372036854775807 | |
%int1_920 = torch.constant.int 1 | |
%1610 = torch.aten.slice.Tensor %1609, %int1_917, %int0_918, %int9223372036854775807_919, %int1_920 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1610, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_921 = torch.constant.int 2 | |
%1611 = torch.aten.unsqueeze %1610, %int2_921 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1611, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_922 = torch.constant.int 3 | |
%int0_923 = torch.constant.int 0 | |
%int9223372036854775807_924 = torch.constant.int 9223372036854775807 | |
%int1_925 = torch.constant.int 1 | |
%1612 = torch.aten.slice.Tensor %1611, %int3_922, %int0_923, %int9223372036854775807_924, %int1_925 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1612, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_926 = torch.constant.int 4 | |
%int1_927 = torch.constant.int 1 | |
%int1_928 = torch.constant.int 1 | |
%int1_929 = torch.constant.int 1 | |
%1613 = torch.prim.ListConstruct %int4_926, %int1_927, %int1_928, %int1_929 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1614 = torch.aten.repeat %1612, %1613 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1614, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
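    // Apply rotary embedding to Q: q*cos + rotate_half(q)*sin over the 128-dim head.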
%1615 = torch.aten.mul.Tensor %1555, %1608 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1615, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_930 = torch.constant.int 3 | |
%int0_931 = torch.constant.int 0 | |
%int64_932 = torch.constant.int 64 | |
%int1_933 = torch.constant.int 1 | |
%1616 = torch.aten.slice.Tensor %1555, %int3_930, %int0_931, %int64_932, %int1_933 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1616, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_934 = torch.constant.int 3 | |
%int64_935 = torch.constant.int 64 | |
%int9223372036854775807_936 = torch.constant.int 9223372036854775807 | |
%int1_937 = torch.constant.int 1 | |
%1617 = torch.aten.slice.Tensor %1555, %int3_934, %int64_935, %int9223372036854775807_936, %int1_937 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1617, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1618 = torch.aten.neg %1617 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1618, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1619 = torch.prim.ListConstruct %1618, %1616 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_938 = torch.constant.int -1 | |
%1620 = torch.aten.cat %1619, %int-1_938 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1620, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%1621 = torch.aten.mul.Tensor %1620, %1614 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1621, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_939 = torch.constant.int 1 | |
%1622 = torch.aten.add.Tensor %1615, %1621, %int1_939 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1622, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
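    // The same RoPE table construction is repeated below, presumably for applying rotary embedding to K.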
%int131072_940 = torch.constant.int 131072 | |
%none_941 = torch.constant.none | |
%none_942 = torch.constant.none | |
%cpu_943 = torch.constant.device "cpu" | |
%false_944 = torch.constant.bool false | |
%1623 = torch.aten.arange %int131072_940, %none_941, %none_942, %cpu_943, %false_944 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_945 = torch.constant.int 0 | |
%int128_946 = torch.constant.int 128 | |
%int2_947 = torch.constant.int 2 | |
%int4_948 = torch.constant.int 4 | |
%none_949 = torch.constant.none | |
%cpu_950 = torch.constant.device "cpu" | |
%false_951 = torch.constant.bool false | |
%1624 = torch.aten.arange.start_step %int0_945, %int128_946, %int2_947, %int4_948, %none_949, %cpu_950, %false_951 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_952 = torch.constant.int 6 | |
%1625 = torch.prims.convert_element_type %1624, %int6_952 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_953 = torch.constant.int 128 | |
%1626 = torch.aten.div.Scalar %1625, %int128_953 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_954 = torch.constant.float 5.000000e+05 | |
%1627 = torch.aten.pow.Scalar %float5.000000e05_954, %1626 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1628 = torch.aten.reciprocal %1627 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_955 = torch.constant.float 1.000000e+00 | |
%1629 = torch.aten.mul.Scalar %1628, %float1.000000e00_955 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1630 = torch.aten.reciprocal %1629 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_956 = torch.constant.float 6.2831853071795862 | |
%1631 = torch.aten.mul.Scalar %1630, %float6.283190e00_956 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_957 = torch.constant.float 8.192000e+03 | |
%1632 = torch.aten.gt.Scalar %1631, %float8.192000e03_957 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_958 = torch.constant.int 8 | |
%1633 = torch.aten.div.Scalar %1629, %int8_958 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1634 = torch.aten.where.self %1632, %1633, %1629 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1635 = torch.aten.reciprocal %1631 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_959 = torch.constant.int 8192 | |
%1636 = torch.aten.mul.Scalar %1635, %int8192_959 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_960 = torch.constant.int 1 | |
%int1_961 = torch.constant.int 1 | |
%1637 = torch.aten.sub.Scalar %1636, %int1_960, %int1_961 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_962 = torch.constant.int 3 | |
%1638 = torch.aten.div.Scalar %1637, %int3_962 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_963 = torch.constant.int 1 | |
%int1_964 = torch.constant.int 1 | |
%1639 = torch.aten.rsub.Scalar %1638, %int1_963, %int1_964 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1640 = torch.aten.mul.Tensor %1639, %1634 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_965 = torch.constant.int 8 | |
%1641 = torch.aten.div.Scalar %1640, %int8_965 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1642 = torch.aten.mul.Tensor %1638, %1634 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_966 = torch.constant.int 1 | |
%1643 = torch.aten.add.Tensor %1641, %1642, %int1_966 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_967 = torch.constant.float 2.048000e+03 | |
%1644 = torch.aten.lt.Scalar %1631, %float2.048000e03_967 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1645 = torch.aten.bitwise_not %1644 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_968 = torch.constant.float 8.192000e+03 | |
%1646 = torch.aten.gt.Scalar %1631, %float8.192000e03_968 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1647 = torch.aten.bitwise_not %1646 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1648 = torch.aten.mul.Tensor %1645, %1647 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1649 = torch.aten.where.self %1648, %1643, %1634 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1650 = torch.prim.ListConstruct %1649, %1649 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_969 = torch.constant.int -1 | |
%1651 = torch.aten.cat %1650, %int-1_969 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_970 = torch.constant.int 6 | |
%1652 = torch.prims.convert_element_type %1651, %int6_970 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_971 = torch.constant.int 1 | |
%1653 = torch.aten.unsqueeze %1623, %int1_971 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_972 = torch.constant.int 6 | |
%1654 = torch.prims.convert_element_type %1653, %int6_972 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_973 = torch.constant.int 0 | |
%1655 = torch.aten.unsqueeze %1652, %int0_973 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_974 = torch.constant.int 6 | |
%1656 = torch.prims.convert_element_type %1655, %int6_974 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1657 = torch.aten.mul.Tensor %1654, %1656 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1658 = torch.aten.cos %1657 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_975 = torch.constant.int 15 | |
%1659 = torch.prims.convert_element_type %1658, %int15_975 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1660 = torch.aten.sin %1657 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_976 = torch.constant.int 15 | |
%1661 = torch.prims.convert_element_type %1660, %int15_976 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_977 = torch.constant.int 0 | |
%int0_978 = torch.constant.int 0 | |
%int1_979 = torch.constant.int 1 | |
%1662 = torch.aten.slice.Tensor %1659, %int0_977, %int0_978, %777, %int1_979 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1662, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_980 = torch.constant.int 1 | |
%int0_981 = torch.constant.int 0 | |
%int9223372036854775807_982 = torch.constant.int 9223372036854775807 | |
%int1_983 = torch.constant.int 1 | |
%1663 = torch.aten.slice.Tensor %1662, %int1_980, %int0_981, %int9223372036854775807_982, %int1_983 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1663, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_984 = torch.constant.int 0 | |
%int0_985 = torch.constant.int 0 | |
%int1_986 = torch.constant.int 1 | |
%1664 = torch.aten.slice.Tensor %1661, %int0_984, %int0_985, %777, %int1_986 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1664, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_987 = torch.constant.int 1 | |
%int0_988 = torch.constant.int 0 | |
%int9223372036854775807_989 = torch.constant.int 9223372036854775807 | |
%int1_990 = torch.constant.int 1 | |
%1665 = torch.aten.slice.Tensor %1664, %int1_987, %int0_988, %int9223372036854775807_989, %int1_990 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1665, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_991 = torch.constant.int 0 | |
%1666 = torch.aten.unsqueeze %1663, %int0_991 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1666, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_992 = torch.constant.int 1 | |
%int0_993 = torch.constant.int 0 | |
%int9223372036854775807_994 = torch.constant.int 9223372036854775807 | |
%int1_995 = torch.constant.int 1 | |
%1667 = torch.aten.slice.Tensor %1666, %int1_992, %int0_993, %int9223372036854775807_994, %int1_995 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1667, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_996 = torch.constant.int 2 | |
%1668 = torch.aten.unsqueeze %1667, %int2_996 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1668, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_997 = torch.constant.int 3 | |
%int0_998 = torch.constant.int 0 | |
%int9223372036854775807_999 = torch.constant.int 9223372036854775807 | |
%int1_1000 = torch.constant.int 1 | |
%1669 = torch.aten.slice.Tensor %1668, %int3_997, %int0_998, %int9223372036854775807_999, %int1_1000 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1669, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1001 = torch.constant.int 4 | |
%int1_1002 = torch.constant.int 1 | |
%int1_1003 = torch.constant.int 1 | |
%int1_1004 = torch.constant.int 1 | |
%1670 = torch.prim.ListConstruct %int4_1001, %int1_1002, %int1_1003, %int1_1004 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1671 = torch.aten.repeat %1669, %1670 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1671, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1005 = torch.constant.int 0 | |
%1672 = torch.aten.unsqueeze %1665, %int0_1005 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1672, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1006 = torch.constant.int 1 | |
%int0_1007 = torch.constant.int 0 | |
%int9223372036854775807_1008 = torch.constant.int 9223372036854775807 | |
%int1_1009 = torch.constant.int 1 | |
%1673 = torch.aten.slice.Tensor %1672, %int1_1006, %int0_1007, %int9223372036854775807_1008, %int1_1009 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1673, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1010 = torch.constant.int 2 | |
%1674 = torch.aten.unsqueeze %1673, %int2_1010 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1674, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1011 = torch.constant.int 3 | |
%int0_1012 = torch.constant.int 0 | |
%int9223372036854775807_1013 = torch.constant.int 9223372036854775807 | |
%int1_1014 = torch.constant.int 1 | |
%1675 = torch.aten.slice.Tensor %1674, %int3_1011, %int0_1012, %int9223372036854775807_1013, %int1_1014 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1675, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1015 = torch.constant.int 4 | |
%int1_1016 = torch.constant.int 1 | |
%int1_1017 = torch.constant.int 1 | |
%int1_1018 = torch.constant.int 1 | |
%1676 = torch.prim.ListConstruct %int4_1015, %int1_1016, %int1_1017, %int1_1018 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1677 = torch.aten.repeat %1675, %1676 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1677, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
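    // Apply rotary position embedding to the 8-head tensor %1557: cos * x + sin * rotate_half(x), where rotate_half negates the upper 64 lanes and concatenates them in front of the lower 64.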
%1678 = torch.aten.mul.Tensor %1557, %1671 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1678, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_1019 = torch.constant.int 3 | |
%int0_1020 = torch.constant.int 0 | |
%int64_1021 = torch.constant.int 64 | |
%int1_1022 = torch.constant.int 1 | |
%1679 = torch.aten.slice.Tensor %1557, %int3_1019, %int0_1020, %int64_1021, %int1_1022 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1679, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_1023 = torch.constant.int 3 | |
%int64_1024 = torch.constant.int 64 | |
%int9223372036854775807_1025 = torch.constant.int 9223372036854775807 | |
%int1_1026 = torch.constant.int 1 | |
%1680 = torch.aten.slice.Tensor %1557, %int3_1023, %int64_1024, %int9223372036854775807_1025, %int1_1026 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1680, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1681 = torch.aten.neg %1680 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1681, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%1682 = torch.prim.ListConstruct %1681, %1679 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1027 = torch.constant.int -1 | |
%1683 = torch.aten.cat %1682, %int-1_1027 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1683, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%1684 = torch.aten.mul.Tensor %1683, %1677 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1684, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_1028 = torch.constant.int 1 | |
%1685 = torch.aten.add.Tensor %1678, %1684, %int1_1028 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1685, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
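    // Compute flat cache slot indices (%arg2 * 64 + 4) and scatter the rotated keys into the paged cache %1392, whose per-page layout is viewed as [32, 2, 32, 8, 128] (likely layer x {K,V} x block_seq x kv_heads x head_dim); the f8 values are reinterpreted as si8 for the index_put and viewed back afterwards.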
%int64_1029 = torch.constant.int 64 | |
%1686 = torch.aten.mul.Scalar %arg2, %int64_1029 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1686, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1030 = torch.constant.int 4 | |
%int1_1031 = torch.constant.int 1 | |
%1687 = torch.aten.add.Scalar %1686, %int4_1030, %int1_1031 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1687, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1032 = torch.constant.int 4 | |
%int32_1033 = torch.constant.int 32 | |
%int8_1034 = torch.constant.int 8 | |
%int128_1035 = torch.constant.int 128 | |
%1688 = torch.prim.ListConstruct %int4_1032, %775, %int32_1033, %int8_1034, %int128_1035 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1689 = torch.aten.view %1685, %1688 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1689, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1036 = torch.constant.int 32 | |
%int8_1037 = torch.constant.int 8 | |
%int128_1038 = torch.constant.int 128 | |
%1690 = torch.prim.ListConstruct %997, %int32_1036, %int8_1037, %int128_1038 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1691 = torch.aten.view %1689, %1690 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1691, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1692 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1693 = torch.aten.view %1687, %1692 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1693, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1039 = torch.constant.int 26 | |
%1694 = torch.prims.convert_element_type %1691, %int26_1039 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1694, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1040 = torch.constant.int 1 | |
%1695 = torch.aten.view.dtype %1694, %int1_1040 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1695, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1696 = torch.aten.detach %1695 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1696, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1697 = torch.aten.detach %1696 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1697, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1041 = torch.constant.int 32 | |
%int2_1042 = torch.constant.int 2 | |
%int32_1043 = torch.constant.int 32 | |
%int8_1044 = torch.constant.int 8 | |
%int128_1045 = torch.constant.int 128 | |
%1698 = torch.prim.ListConstruct %776, %int32_1041, %int2_1042, %int32_1043, %int8_1044, %int128_1045 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1699 = torch.aten.view %1392, %1698 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1699, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1046 = torch.constant.int 32 | |
%int8_1047 = torch.constant.int 8 | |
%int128_1048 = torch.constant.int 128 | |
%1700 = torch.prim.ListConstruct %990, %int32_1046, %int8_1047, %int128_1048 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1701 = torch.aten.view %1699, %1700 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1701, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1049 = torch.constant.int 1 | |
%1702 = torch.aten.view.dtype %1701, %int1_1049 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1702, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1703 = torch.aten.detach %1702 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1703, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1704 = torch.aten.detach %1703 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1704, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1705 = torch.prim.ListConstruct %1693 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1050 = torch.constant.bool false | |
%1706 = torch.aten.index_put %1704, %1705, %1697, %false_1050 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1706, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1051 = torch.constant.int 26 | |
%1707 = torch.aten.view.dtype %1706, %int26_1051 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1707, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1708 = torch.aten.detach %1707 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1708, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1709 = torch.aten.detach %1708 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1709, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1052 = torch.constant.int 32 | |
%int2_1053 = torch.constant.int 2 | |
%int32_1054 = torch.constant.int 32 | |
%int8_1055 = torch.constant.int 8 | |
%int128_1056 = torch.constant.int 128 | |
%1710 = torch.prim.ListConstruct %776, %int32_1052, %int2_1053, %int32_1054, %int8_1055, %int128_1056 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1711 = torch.aten.view %1709, %1710 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1711, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1057 = torch.constant.int 2097152 | |
%1712 = torch.prim.ListConstruct %776, %int2097152_1057 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1713 = torch.aten.view %1711, %1712 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1713, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
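    // Same scatter for the 8-head tensor %1559 (the values), one slot further (+1) in the page layout.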
%int4_1058 = torch.constant.int 4 | |
%int32_1059 = torch.constant.int 32 | |
%int8_1060 = torch.constant.int 8 | |
%int128_1061 = torch.constant.int 128 | |
%1714 = torch.prim.ListConstruct %int4_1058, %775, %int32_1059, %int8_1060, %int128_1061 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1715 = torch.aten.view %1559, %1714 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1715, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1062 = torch.constant.int 32 | |
%int8_1063 = torch.constant.int 8 | |
%int128_1064 = torch.constant.int 128 | |
%1716 = torch.prim.ListConstruct %997, %int32_1062, %int8_1063, %int128_1064 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1717 = torch.aten.view %1715, %1716 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1717, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1065 = torch.constant.int 1 | |
%int1_1066 = torch.constant.int 1 | |
%1718 = torch.aten.add.Scalar %1687, %int1_1065, %int1_1066 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %1718, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%1719 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%1720 = torch.aten.view %1718, %1719 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %1720, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1067 = torch.constant.int 26 | |
%1721 = torch.prims.convert_element_type %1717, %int26_1067 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1721, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1068 = torch.constant.int 1 | |
%1722 = torch.aten.view.dtype %1721, %int1_1068 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1722, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1723 = torch.aten.detach %1722 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1723, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1724 = torch.aten.detach %1723 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1724, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1069 = torch.constant.int 32 | |
%int2_1070 = torch.constant.int 2 | |
%int32_1071 = torch.constant.int 32 | |
%int8_1072 = torch.constant.int 8 | |
%int128_1073 = torch.constant.int 128 | |
%1725 = torch.prim.ListConstruct %776, %int32_1069, %int2_1070, %int32_1071, %int8_1072, %int128_1073 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1726 = torch.aten.view %1713, %1725 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1726, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1074 = torch.constant.int 32 | |
%int8_1075 = torch.constant.int 8 | |
%int128_1076 = torch.constant.int 128 | |
%1727 = torch.prim.ListConstruct %990, %int32_1074, %int8_1075, %int128_1076 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1728 = torch.aten.view %1726, %1727 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1728, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1077 = torch.constant.int 1 | |
%1729 = torch.aten.view.dtype %1728, %int1_1077 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1729, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1730 = torch.aten.detach %1729 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1730, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1731 = torch.aten.detach %1730 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1731, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%1732 = torch.prim.ListConstruct %1720 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1078 = torch.constant.bool false | |
%1733 = torch.aten.index_put %1731, %1732, %1724, %false_1078 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %1733, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1079 = torch.constant.int 26 | |
%1734 = torch.aten.view.dtype %1733, %int26_1079 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1734, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1735 = torch.aten.detach %1734 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1735, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%1736 = torch.aten.detach %1735 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1736, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1080 = torch.constant.int 32 | |
%int2_1081 = torch.constant.int 2 | |
%int32_1082 = torch.constant.int 32 | |
%int8_1083 = torch.constant.int 8 | |
%int128_1084 = torch.constant.int 128 | |
%1737 = torch.prim.ListConstruct %776, %int32_1080, %int2_1081, %int32_1082, %int8_1083, %int128_1084 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1738 = torch.aten.view %1736, %1737 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1738, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1085 = torch.constant.int 2097152 | |
%1739 = torch.prim.ListConstruct %776, %int2097152_1085 : (!torch.int, !torch.int) -> !torch.list<int> | |
%1740 = torch.aten.view %1738, %1739 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1740, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
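    // Expand the 8 KV heads to 32 heads (group size 4, grouped-query attention) via unsqueeze/expand/_unsafe_view, for both the rotated keys and the values.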
%int-2_1086 = torch.constant.int -2 | |
%1741 = torch.aten.unsqueeze %1685, %int-2_1086 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1741, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1087 = torch.constant.int 4 | |
%int8_1088 = torch.constant.int 8 | |
%int4_1089 = torch.constant.int 4 | |
%int128_1090 = torch.constant.int 128 | |
%1742 = torch.prim.ListConstruct %int4_1087, %777, %int8_1088, %int4_1089, %int128_1090 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1091 = torch.constant.bool false | |
%1743 = torch.aten.expand %1741, %1742, %false_1091 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1743, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1092 = torch.constant.int 0 | |
%1744 = torch.aten.clone %1743, %int0_1092 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1744, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1093 = torch.constant.int 4 | |
%int32_1094 = torch.constant.int 32 | |
%int128_1095 = torch.constant.int 128 | |
%1745 = torch.prim.ListConstruct %int4_1093, %777, %int32_1094, %int128_1095 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1746 = torch.aten._unsafe_view %1744, %1745 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1746, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_1096 = torch.constant.int -2 | |
%1747 = torch.aten.unsqueeze %1559, %int-2_1096 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1747, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1097 = torch.constant.int 4 | |
%int8_1098 = torch.constant.int 8 | |
%int4_1099 = torch.constant.int 4 | |
%int128_1100 = torch.constant.int 128 | |
%1748 = torch.prim.ListConstruct %int4_1097, %777, %int8_1098, %int4_1099, %int128_1100 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1101 = torch.constant.bool false | |
%1749 = torch.aten.expand %1747, %1748, %false_1101 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1749, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1102 = torch.constant.int 0 | |
%1750 = torch.aten.clone %1749, %int0_1102 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1750, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1103 = torch.constant.int 4 | |
%int32_1104 = torch.constant.int 32 | |
%int128_1105 = torch.constant.int 128 | |
%1751 = torch.prim.ListConstruct %int4_1103, %777, %int32_1104, %int128_1105 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1752 = torch.aten._unsafe_view %1750, %1751 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1752, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
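    // Transpose %1622 and the two expanded tensors to [4, 32, seq, 128] and cast them to f8E4M3FNUZ; the f32 attention mask %803 is converted to f8 as well.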
%int1_1106 = torch.constant.int 1 | |
%int2_1107 = torch.constant.int 2 | |
%1753 = torch.aten.transpose.int %1622, %int1_1106, %int2_1107 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1753, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1108 = torch.constant.int 1 | |
%int2_1109 = torch.constant.int 2 | |
%1754 = torch.aten.transpose.int %1746, %int1_1108, %int2_1109 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1754, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1110 = torch.constant.int 1 | |
%int2_1111 = torch.constant.int 2 | |
%1755 = torch.aten.transpose.int %1752, %int1_1110, %int2_1111 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1755, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1112 = torch.constant.int 26 | |
%1756 = torch.prims.convert_element_type %1753, %int26_1112 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1756, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1113 = torch.constant.int 26 | |
%1757 = torch.prims.convert_element_type %1754, %int26_1113 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1757, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1114 = torch.constant.int 26 | |
%1758 = torch.prims.convert_element_type %1755, %int26_1114 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1758, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1115 = torch.constant.int 26 | |
%1759 = torch.prims.convert_element_type %803, %int26_1115 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1759, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
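    // Reduce the mask to a single 2-D [seq, seq] slice (batch 0, head 0) for the attention kernel.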
%int0_1116 = torch.constant.int 0 | |
%int0_1117 = torch.constant.int 0 | |
%1760 = torch.aten.select.int %1759, %int0_1116, %int0_1117 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1760, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_1118 = torch.constant.int 0 | |
%int0_1119 = torch.constant.int 0 | |
%1761 = torch.aten.select.int %1760, %int0_1118, %int0_1119 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1761, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_1120 = torch.constant.int 0 | |
%int0_1121 = torch.constant.int 0 | |
%int9223372036854775807_1122 = torch.constant.int 9223372036854775807 | |
%int1_1123 = torch.constant.int 1 | |
%1762 = torch.aten.slice.Tensor %1761, %int0_1120, %int0_1121, %int9223372036854775807_1122, %int1_1123 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1762, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_1124 = torch.constant.int 1 | |
%int0_1125 = torch.constant.int 0 | |
%int9223372036854775807_1126 = torch.constant.int 9223372036854775807 | |
%int1_1127 = torch.constant.int 1 | |
%1763 = torch.aten.slice.Tensor %1762, %int1_1124, %int0_1125, %int9223372036854775807_1126, %int1_1127 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1763, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
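    // Clone/detach the scalar %59 and pass it as the scale operand of the sharktank masked flash-attention kernel, which consumes the f8 Q/K/V and mask and returns f32.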
%none_1128 = torch.constant.none | |
%1764 = torch.aten.clone %59, %none_1128 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%1765 = torch.aten.detach %1764 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1766 = torch.aten.detach %1765 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1767 = torch.aten.detach %1766 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%1768 = torch_c.to_builtin_tensor %1756 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1769 = torch_c.to_builtin_tensor %1757 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1770 = torch_c.to_builtin_tensor %1758 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%1771 = torch_c.to_builtin_tensor %1763 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%1772 = torch_c.to_builtin_tensor %1767 : !torch.vtensor<[],f32> -> tensor<f32> | |
%1773 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%1768, %1769, %1770, %1772, %1771) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%1774 = torch_c.from_builtin_tensor %1773 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %1774, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
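    // Transpose the attention output back to [4, seq, 32, 128], flatten to [4, seq, 4096], quantize (divide by the input rscale, clamp to the f8E4M3FNUZ range [-240, 240]) and run the output projection through the batched transpose-B matmul kernel, then rescale and add the residual %1504.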
%int1_1129 = torch.constant.int 1 | |
%int2_1130 = torch.constant.int 2 | |
%1775 = torch.aten.transpose.int %1774, %int1_1129, %int2_1130 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1775, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_1131 = torch.constant.int 0 | |
%1776 = torch.aten.clone %1775, %int0_1131 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %1776, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_1132 = torch.constant.int 4 | |
%int4096_1133 = torch.constant.int 4096 | |
%1777 = torch.prim.ListConstruct %int4_1132, %777, %int4096_1133 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1778 = torch.aten._unsafe_view %1776, %1777 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1778, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1779 = torch.aten.div.Tensor %1778, %60 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1779, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1134 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1135 = torch.constant.float 2.400000e+02 | |
%1780 = torch.aten.clamp %1779, %float-2.400000e02_1134, %float2.400000e02_1135 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1780, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1136 = torch.constant.int 26 | |
%1781 = torch.prims.convert_element_type %1780, %int26_1136 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1781, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1137 = torch.constant.int 0 | |
%1782 = torch.aten.unsqueeze %61, %int0_1137 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1138 = torch.constant.int 4 | |
%int4096_1139 = torch.constant.int 4096 | |
%int4096_1140 = torch.constant.int 4096 | |
%1783 = torch.prim.ListConstruct %int4_1138, %int4096_1139, %int4096_1140 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1141 = torch.constant.bool false | |
%1784 = torch.aten.expand %1782, %1783, %false_1141 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1785 = torch_c.to_builtin_tensor %1781 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1786 = torch_c.to_builtin_tensor %1784 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1787 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1785, %1786) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1788 = torch_c.from_builtin_tensor %1787 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1788, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1789 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1790 = torch.aten.permute %62, %1789 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1791 = torch.aten.mul.Tensor %60, %1790 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1142 = torch.constant.int 6 | |
%1792 = torch.prims.convert_element_type %1788, %int6_1142 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1792, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1793 = torch.aten.mul.Tensor %1792, %1791 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1793, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1143 = torch.constant.int 1 | |
%1794 = torch.aten.add.Tensor %1504, %1793, %int1_1143 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1794, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
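    // FFN RMSNorm: mean of squares over the hidden dim, add eps 1e-5, rsqrt, scale by the [4096] bf16 norm weight; then quantize the normalized activations for the first FFN projection.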
%int6_1144 = torch.constant.int 6 | |
%1795 = torch.prims.convert_element_type %1794, %int6_1144 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1795, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1145 = torch.constant.int 2 | |
%1796 = torch.aten.pow.Tensor_Scalar %1795, %int2_1145 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1796, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1146 = torch.constant.int -1 | |
%1797 = torch.prim.ListConstruct %int-1_1146 : (!torch.int) -> !torch.list<int> | |
%true_1147 = torch.constant.bool true | |
%none_1148 = torch.constant.none | |
%1798 = torch.aten.mean.dim %1796, %1797, %true_1147, %none_1148 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1798, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1149 = torch.constant.float 1.000000e-05 | |
%int1_1150 = torch.constant.int 1 | |
%1799 = torch.aten.add.Scalar %1798, %float1.000000e-05_1149, %int1_1150 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1799, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1800 = torch.aten.rsqrt %1799 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1800, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1801 = torch.aten.mul.Tensor %1795, %1800 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1801, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1151 = torch.constant.int 6 | |
%1802 = torch.prims.convert_element_type %1801, %int6_1151 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1802, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1803 = torch.aten.mul.Tensor %63, %1802 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1803, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1152 = torch.constant.int 6 | |
%1804 = torch.prims.convert_element_type %1803, %int6_1152 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1804, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1805 = torch.aten.div.Tensor %1804, %64 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1805, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1153 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1154 = torch.constant.float 2.400000e+02 | |
%1806 = torch.aten.clamp %1805, %float-2.400000e02_1153, %float2.400000e02_1154 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1806, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1155 = torch.constant.int 26 | |
%1807 = torch.prims.convert_element_type %1806, %int26_1155 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1807, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
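    // Gate projection: f8 matmul against the [14336, 4096] weight %65, rescaled by the combined input/weight scales, followed by SiLU.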
%int0_1156 = torch.constant.int 0 | |
%1808 = torch.aten.unsqueeze %65, %int0_1156 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1157 = torch.constant.int 4 | |
%int14336_1158 = torch.constant.int 14336 | |
%int4096_1159 = torch.constant.int 4096 | |
%1809 = torch.prim.ListConstruct %int4_1157, %int14336_1158, %int4096_1159 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1160 = torch.constant.bool false | |
%1810 = torch.aten.expand %1808, %1809, %false_1160 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1811 = torch_c.to_builtin_tensor %1807 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1812 = torch_c.to_builtin_tensor %1810 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1813 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1811, %1812) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1814 = torch_c.from_builtin_tensor %1813 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1814, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1815 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1816 = torch.aten.permute %66, %1815 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1817 = torch.aten.mul.Tensor %64, %1816 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1161 = torch.constant.int 6 | |
%1818 = torch.prims.convert_element_type %1814, %int6_1161 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1818, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1819 = torch.aten.mul.Tensor %1818, %1817 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1819, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1820 = torch.aten.silu %1819 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1820, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
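    // Up projection on the same normalized activations (weight %68), rescaled and multiplied elementwise with the SiLU-gated branch.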
%1821 = torch.aten.div.Tensor %1804, %67 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1821, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1162 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1163 = torch.constant.float 2.400000e+02 | |
%1822 = torch.aten.clamp %1821, %float-2.400000e02_1162, %float2.400000e02_1163 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1822, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1164 = torch.constant.int 26 | |
%1823 = torch.prims.convert_element_type %1822, %int26_1164 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1823, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1165 = torch.constant.int 0 | |
%1824 = torch.aten.unsqueeze %68, %int0_1165 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1166 = torch.constant.int 4 | |
%int14336_1167 = torch.constant.int 14336 | |
%int4096_1168 = torch.constant.int 4096 | |
%1825 = torch.prim.ListConstruct %int4_1166, %int14336_1167, %int4096_1168 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1169 = torch.constant.bool false | |
%1826 = torch.aten.expand %1824, %1825, %false_1169 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%1827 = torch_c.to_builtin_tensor %1823 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1828 = torch_c.to_builtin_tensor %1826 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%1829 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%1827, %1828) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%1830 = torch_c.from_builtin_tensor %1829 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1830, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1831 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1832 = torch.aten.permute %69, %1831 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1833 = torch.aten.mul.Tensor %67, %1832 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1170 = torch.constant.int 6 | |
%1834 = torch.prims.convert_element_type %1830, %int6_1170 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1834, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1835 = torch.aten.mul.Tensor %1834, %1833 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1835, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%1836 = torch.aten.mul.Tensor %1820, %1835 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1836, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
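    // Down projection back to 4096 (weight %71, [4096, 14336]), rescaled, then added to the residual stream %1794.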
%1837 = torch.aten.div.Tensor %1836, %70 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1837, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_1171 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1172 = torch.constant.float 2.400000e+02 | |
%1838 = torch.aten.clamp %1837, %float-2.400000e02_1171, %float2.400000e02_1172 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %1838, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_1173 = torch.constant.int 26 | |
%1839 = torch.prims.convert_element_type %1838, %int26_1173 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1839, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_1174 = torch.constant.int 0 | |
%1840 = torch.aten.unsqueeze %71, %int0_1174 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_1175 = torch.constant.int 4 | |
%int4096_1176 = torch.constant.int 4096 | |
%int14336_1177 = torch.constant.int 14336 | |
%1841 = torch.prim.ListConstruct %int4_1175, %int4096_1176, %int14336_1177 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1178 = torch.constant.bool false | |
%1842 = torch.aten.expand %1840, %1841, %false_1178 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%1843 = torch_c.to_builtin_tensor %1839 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%1844 = torch_c.to_builtin_tensor %1842 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%1845 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%1843, %1844) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1846 = torch_c.from_builtin_tensor %1845 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1846, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1847 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%1848 = torch.aten.permute %72, %1847 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%1849 = torch.aten.mul.Tensor %70, %1848 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1179 = torch.constant.int 6 | |
%1850 = torch.prims.convert_element_type %1846, %int6_1179 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1850, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1851 = torch.aten.mul.Tensor %1850, %1849 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1851, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1180 = torch.constant.int 1 | |
%1852 = torch.aten.add.Tensor %1794, %1851, %int1_1180 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1852, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
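    // What follows appears to be the next decoder block: attention RMSNorm over the new residual, then quantization of the normalized activations.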
%int6_1181 = torch.constant.int 6 | |
%1853 = torch.prims.convert_element_type %1852, %int6_1181 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1853, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1182 = torch.constant.int 2 | |
%1854 = torch.aten.pow.Tensor_Scalar %1853, %int2_1182 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1854, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1183 = torch.constant.int -1 | |
%1855 = torch.prim.ListConstruct %int-1_1183 : (!torch.int) -> !torch.list<int> | |
%true_1184 = torch.constant.bool true | |
%none_1185 = torch.constant.none | |
%1856 = torch.aten.mean.dim %1854, %1855, %true_1184, %none_1185 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1856, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1186 = torch.constant.float 1.000000e-05 | |
%int1_1187 = torch.constant.int 1 | |
%1857 = torch.aten.add.Scalar %1856, %float1.000000e-05_1186, %int1_1187 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1857, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1858 = torch.aten.rsqrt %1857 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %1858, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%1859 = torch.aten.mul.Tensor %1853, %1858 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1859, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1188 = torch.constant.int 6 | |
%1860 = torch.prims.convert_element_type %1859, %int6_1188 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1860, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1861 = torch.aten.mul.Tensor %73, %1860 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1861, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1189 = torch.constant.int 6 | |
%1862 = torch.prims.convert_element_type %1861, %int6_1189 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1862, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1863 = torch.aten.div.Tensor %1862, %74 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1863, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1190 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1191 = torch.constant.float 2.400000e+02 | |
%1864 = torch.aten.clamp %1863, %float-2.400000e02_1190, %float2.400000e02_1191 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1864, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1192 = torch.constant.int 26 | |
%1865 = torch.prims.convert_element_type %1864, %int26_1192 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1865, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
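    // Q projection: batched transpose-B matmul against the [4096, 4096] f8 weight %75, with the f32 result requantized to f8.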
%int0_1193 = torch.constant.int 0 | |
%1866 = torch.aten.unsqueeze %75, %int0_1193 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1194 = torch.constant.int 4 | |
%int4096_1195 = torch.constant.int 4096 | |
%int4096_1196 = torch.constant.int 4096 | |
%1867 = torch.prim.ListConstruct %int4_1194, %int4096_1195, %int4096_1196 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1197 = torch.constant.bool false | |
%1868 = torch.aten.expand %1866, %1867, %false_1197 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%1869 = torch_c.to_builtin_tensor %1865 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1870 = torch_c.to_builtin_tensor %1868 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%1871 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%1869, %1870) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%1872 = torch_c.from_builtin_tensor %1871 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1872, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%1873 = torch.aten.div.Tensor %1872, %76 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1873, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1198 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1199 = torch.constant.float 2.400000e+02 | |
%1874 = torch.aten.clamp %1873, %float-2.400000e02_1198, %float2.400000e02_1199 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1874, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1200 = torch.constant.int 26 | |
%1875 = torch.prims.convert_element_type %1874, %int26_1200 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1875, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
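    // K projection: the normalized activations are requantized against scale %77 and multiplied by the [1024, 4096] weight %78, then quantized again.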
%1876 = torch.aten.div.Tensor %1862, %77 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1876, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1201 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1202 = torch.constant.float 2.400000e+02 | |
%1877 = torch.aten.clamp %1876, %float-2.400000e02_1201, %float2.400000e02_1202 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1877, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1203 = torch.constant.int 26 | |
%1878 = torch.prims.convert_element_type %1877, %int26_1203 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1878, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1204 = torch.constant.int 0 | |
%1879 = torch.aten.unsqueeze %78, %int0_1204 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1205 = torch.constant.int 4 | |
%int1024_1206 = torch.constant.int 1024 | |
%int4096_1207 = torch.constant.int 4096 | |
%1880 = torch.prim.ListConstruct %int4_1205, %int1024_1206, %int4096_1207 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1208 = torch.constant.bool false | |
%1881 = torch.aten.expand %1879, %1880, %false_1208 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1882 = torch_c.to_builtin_tensor %1878 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1883 = torch_c.to_builtin_tensor %1881 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1884 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1882, %1883) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1885 = torch_c.from_builtin_tensor %1884 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1885, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1886 = torch.aten.div.Tensor %1885, %79 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1886, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1209 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1210 = torch.constant.float 2.400000e+02 | |
%1887 = torch.aten.clamp %1886, %float-2.400000e02_1209, %float2.400000e02_1210 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1887, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1211 = torch.constant.int 26 | |
%1888 = torch.prims.convert_element_type %1887, %int26_1211 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1888, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
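// Same pattern for the V projection: quantize with the attn_v input rscale, run the batched f8 matmul against the [1024,4096] attn_v weight, rescale and requantize to f8E4M3FNUZ.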
%1889 = torch.aten.div.Tensor %1862, %80 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1889, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1212 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1213 = torch.constant.float 2.400000e+02 | |
%1890 = torch.aten.clamp %1889, %float-2.400000e02_1212, %float2.400000e02_1213 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %1890, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1214 = torch.constant.int 26 | |
%1891 = torch.prims.convert_element_type %1890, %int26_1214 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1891, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1215 = torch.constant.int 0 | |
%1892 = torch.aten.unsqueeze %81, %int0_1215 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1216 = torch.constant.int 4 | |
%int1024_1217 = torch.constant.int 1024 | |
%int4096_1218 = torch.constant.int 4096 | |
%1893 = torch.prim.ListConstruct %int4_1216, %int1024_1217, %int4096_1218 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1219 = torch.constant.bool false | |
%1894 = torch.aten.expand %1892, %1893, %false_1219 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%1895 = torch_c.to_builtin_tensor %1891 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%1896 = torch_c.to_builtin_tensor %1894 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%1897 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%1895, %1896) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%1898 = torch_c.from_builtin_tensor %1897 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1898, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%1899 = torch.aten.div.Tensor %1898, %82 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1899, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1220 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1221 = torch.constant.float 2.400000e+02 | |
%1900 = torch.aten.clamp %1899, %float-2.400000e02_1220, %float2.400000e02_1221 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %1900, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1222 = torch.constant.int 26 | |
%1901 = torch.prims.convert_element_type %1900, %int26_1222 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1901, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
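// Reshape the projected activations into heads: queries to [4,?,32,128], keys and values to [4,?,8,128], i.e. 32 query heads and 8 KV heads of dimension 128 (grouped-query attention).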
%int4_1223 = torch.constant.int 4 | |
%int32_1224 = torch.constant.int 32 | |
%int128_1225 = torch.constant.int 128 | |
%1902 = torch.prim.ListConstruct %int4_1223, %777, %int32_1224, %int128_1225 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1903 = torch.aten.view %1875, %1902 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1903, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_1226 = torch.constant.int 4 | |
%int8_1227 = torch.constant.int 8 | |
%int128_1228 = torch.constant.int 128 | |
%1904 = torch.prim.ListConstruct %int4_1226, %777, %int8_1227, %int128_1228 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1905 = torch.aten.view %1888, %1904 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1905, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_1229 = torch.constant.int 4 | |
%int8_1230 = torch.constant.int 8 | |
%int128_1231 = torch.constant.int 128 | |
%1906 = torch.prim.ListConstruct %int4_1229, %777, %int8_1230, %int128_1231 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1907 = torch.aten.view %1901, %1906 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1907, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
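// The ops below appear to build the rotary-embedding tables: positions 0..131071, inverse frequencies with base 5.0e5 over a head dimension of 128, a Llama-3-style wavelength-dependent rescaling (original context 8192, scaling factor 8), and cos/sin tables cast to bf16, which are then sliced to the current sequence length and applied to the query heads.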
%int131072_1232 = torch.constant.int 131072 | |
%none_1233 = torch.constant.none | |
%none_1234 = torch.constant.none | |
%cpu_1235 = torch.constant.device "cpu" | |
%false_1236 = torch.constant.bool false | |
%1908 = torch.aten.arange %int131072_1232, %none_1233, %none_1234, %cpu_1235, %false_1236 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1237 = torch.constant.int 0 | |
%int128_1238 = torch.constant.int 128 | |
%int2_1239 = torch.constant.int 2 | |
%int4_1240 = torch.constant.int 4 | |
%none_1241 = torch.constant.none | |
%cpu_1242 = torch.constant.device "cpu" | |
%false_1243 = torch.constant.bool false | |
%1909 = torch.aten.arange.start_step %int0_1237, %int128_1238, %int2_1239, %int4_1240, %none_1241, %cpu_1242, %false_1243 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1244 = torch.constant.int 6 | |
%1910 = torch.prims.convert_element_type %1909, %int6_1244 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1245 = torch.constant.int 128 | |
%1911 = torch.aten.div.Scalar %1910, %int128_1245 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1246 = torch.constant.float 5.000000e+05 | |
%1912 = torch.aten.pow.Scalar %float5.000000e05_1246, %1911 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1913 = torch.aten.reciprocal %1912 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1247 = torch.constant.float 1.000000e+00 | |
%1914 = torch.aten.mul.Scalar %1913, %float1.000000e00_1247 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1915 = torch.aten.reciprocal %1914 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1248 = torch.constant.float 6.2831853071795862 | |
%1916 = torch.aten.mul.Scalar %1915, %float6.283190e00_1248 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1249 = torch.constant.float 8.192000e+03 | |
%1917 = torch.aten.gt.Scalar %1916, %float8.192000e03_1249 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1250 = torch.constant.int 8 | |
%1918 = torch.aten.div.Scalar %1914, %int8_1250 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1919 = torch.aten.where.self %1917, %1918, %1914 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1920 = torch.aten.reciprocal %1916 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1251 = torch.constant.int 8192 | |
%1921 = torch.aten.mul.Scalar %1920, %int8192_1251 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1252 = torch.constant.int 1 | |
%int1_1253 = torch.constant.int 1 | |
%1922 = torch.aten.sub.Scalar %1921, %int1_1252, %int1_1253 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1254 = torch.constant.int 3 | |
%1923 = torch.aten.div.Scalar %1922, %int3_1254 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1255 = torch.constant.int 1 | |
%int1_1256 = torch.constant.int 1 | |
%1924 = torch.aten.rsub.Scalar %1923, %int1_1255, %int1_1256 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1925 = torch.aten.mul.Tensor %1924, %1919 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1257 = torch.constant.int 8 | |
%1926 = torch.aten.div.Scalar %1925, %int8_1257 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1927 = torch.aten.mul.Tensor %1923, %1919 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1258 = torch.constant.int 1 | |
%1928 = torch.aten.add.Tensor %1926, %1927, %int1_1258 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1259 = torch.constant.float 2.048000e+03 | |
%1929 = torch.aten.lt.Scalar %1916, %float2.048000e03_1259 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1930 = torch.aten.bitwise_not %1929 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1260 = torch.constant.float 8.192000e+03 | |
%1931 = torch.aten.gt.Scalar %1916, %float8.192000e03_1260 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1932 = torch.aten.bitwise_not %1931 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1933 = torch.aten.mul.Tensor %1930, %1932 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1934 = torch.aten.where.self %1933, %1928, %1919 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1935 = torch.prim.ListConstruct %1934, %1934 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1261 = torch.constant.int -1 | |
%1936 = torch.aten.cat %1935, %int-1_1261 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1262 = torch.constant.int 6 | |
%1937 = torch.prims.convert_element_type %1936, %int6_1262 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_1263 = torch.constant.int 1 | |
%1938 = torch.aten.unsqueeze %1908, %int1_1263 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1264 = torch.constant.int 6 | |
%1939 = torch.prims.convert_element_type %1938, %int6_1264 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1265 = torch.constant.int 0 | |
%1940 = torch.aten.unsqueeze %1937, %int0_1265 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1266 = torch.constant.int 6 | |
%1941 = torch.prims.convert_element_type %1940, %int6_1266 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%1942 = torch.aten.mul.Tensor %1939, %1941 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%1943 = torch.aten.cos %1942 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1267 = torch.constant.int 15 | |
%1944 = torch.prims.convert_element_type %1943, %int15_1267 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%1945 = torch.aten.sin %1942 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1268 = torch.constant.int 15 | |
%1946 = torch.prims.convert_element_type %1945, %int15_1268 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_1269 = torch.constant.int 0 | |
%int0_1270 = torch.constant.int 0 | |
%int1_1271 = torch.constant.int 1 | |
%1947 = torch.aten.slice.Tensor %1944, %int0_1269, %int0_1270, %777, %int1_1271 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1947, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1272 = torch.constant.int 1 | |
%int0_1273 = torch.constant.int 0 | |
%int9223372036854775807_1274 = torch.constant.int 9223372036854775807 | |
%int1_1275 = torch.constant.int 1 | |
%1948 = torch.aten.slice.Tensor %1947, %int1_1272, %int0_1273, %int9223372036854775807_1274, %int1_1275 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1948, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1276 = torch.constant.int 0 | |
%int0_1277 = torch.constant.int 0 | |
%int1_1278 = torch.constant.int 1 | |
%1949 = torch.aten.slice.Tensor %1946, %int0_1276, %int0_1277, %777, %int1_1278 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1949, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1279 = torch.constant.int 1 | |
%int0_1280 = torch.constant.int 0 | |
%int9223372036854775807_1281 = torch.constant.int 9223372036854775807 | |
%int1_1282 = torch.constant.int 1 | |
%1950 = torch.aten.slice.Tensor %1949, %int1_1279, %int0_1280, %int9223372036854775807_1281, %int1_1282 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %1950, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1283 = torch.constant.int 0 | |
%1951 = torch.aten.unsqueeze %1948, %int0_1283 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1951, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1284 = torch.constant.int 1 | |
%int0_1285 = torch.constant.int 0 | |
%int9223372036854775807_1286 = torch.constant.int 9223372036854775807 | |
%int1_1287 = torch.constant.int 1 | |
%1952 = torch.aten.slice.Tensor %1951, %int1_1284, %int0_1285, %int9223372036854775807_1286, %int1_1287 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1952, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1288 = torch.constant.int 2 | |
%1953 = torch.aten.unsqueeze %1952, %int2_1288 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1953, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1289 = torch.constant.int 3 | |
%int0_1290 = torch.constant.int 0 | |
%int9223372036854775807_1291 = torch.constant.int 9223372036854775807 | |
%int1_1292 = torch.constant.int 1 | |
%1954 = torch.aten.slice.Tensor %1953, %int3_1289, %int0_1290, %int9223372036854775807_1291, %int1_1292 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1954, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1293 = torch.constant.int 4 | |
%int1_1294 = torch.constant.int 1 | |
%int1_1295 = torch.constant.int 1 | |
%int1_1296 = torch.constant.int 1 | |
%1955 = torch.prim.ListConstruct %int4_1293, %int1_1294, %int1_1295, %int1_1296 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1956 = torch.aten.repeat %1954, %1955 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1956, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1297 = torch.constant.int 0 | |
%1957 = torch.aten.unsqueeze %1950, %int0_1297 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1957, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1298 = torch.constant.int 1 | |
%int0_1299 = torch.constant.int 0 | |
%int9223372036854775807_1300 = torch.constant.int 9223372036854775807 | |
%int1_1301 = torch.constant.int 1 | |
%1958 = torch.aten.slice.Tensor %1957, %int1_1298, %int0_1299, %int9223372036854775807_1300, %int1_1301 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %1958, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1302 = torch.constant.int 2 | |
%1959 = torch.aten.unsqueeze %1958, %int2_1302 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1959, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1303 = torch.constant.int 3 | |
%int0_1304 = torch.constant.int 0 | |
%int9223372036854775807_1305 = torch.constant.int 9223372036854775807 | |
%int1_1306 = torch.constant.int 1 | |
%1960 = torch.aten.slice.Tensor %1959, %int3_1303, %int0_1304, %int9223372036854775807_1305, %int1_1306 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %1960, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1307 = torch.constant.int 4 | |
%int1_1308 = torch.constant.int 1 | |
%int1_1309 = torch.constant.int 1 | |
%int1_1310 = torch.constant.int 1 | |
%1961 = torch.prim.ListConstruct %int4_1307, %int1_1308, %int1_1309, %int1_1310 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%1962 = torch.aten.repeat %1960, %1961 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %1962, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%1963 = torch.aten.mul.Tensor %1903, %1956 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1963, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_1311 = torch.constant.int 3 | |
%int0_1312 = torch.constant.int 0 | |
%int64_1313 = torch.constant.int 64 | |
%int1_1314 = torch.constant.int 1 | |
%1964 = torch.aten.slice.Tensor %1903, %int3_1311, %int0_1312, %int64_1313, %int1_1314 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1964, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_1315 = torch.constant.int 3 | |
%int64_1316 = torch.constant.int 64 | |
%int9223372036854775807_1317 = torch.constant.int 9223372036854775807 | |
%int1_1318 = torch.constant.int 1 | |
%1965 = torch.aten.slice.Tensor %1903, %int3_1315, %int64_1316, %int9223372036854775807_1317, %int1_1318 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1965, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1966 = torch.aten.neg %1965 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1966, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%1967 = torch.prim.ListConstruct %1966, %1964 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1319 = torch.constant.int -1 | |
%1968 = torch.aten.cat %1967, %int-1_1319 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1968, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%1969 = torch.aten.mul.Tensor %1968, %1962 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1969, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_1320 = torch.constant.int 1 | |
%1970 = torch.aten.add.Tensor %1963, %1969, %int1_1320 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %1970, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
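// %1970 is the rotated query. The same cos/sin table construction is repeated below and the rotary embedding is applied to the key heads (%1905), yielding %2033.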
%int131072_1321 = torch.constant.int 131072 | |
%none_1322 = torch.constant.none | |
%none_1323 = torch.constant.none | |
%cpu_1324 = torch.constant.device "cpu" | |
%false_1325 = torch.constant.bool false | |
%1971 = torch.aten.arange %int131072_1321, %none_1322, %none_1323, %cpu_1324, %false_1325 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1326 = torch.constant.int 0 | |
%int128_1327 = torch.constant.int 128 | |
%int2_1328 = torch.constant.int 2 | |
%int4_1329 = torch.constant.int 4 | |
%none_1330 = torch.constant.none | |
%cpu_1331 = torch.constant.device "cpu" | |
%false_1332 = torch.constant.bool false | |
%1972 = torch.aten.arange.start_step %int0_1326, %int128_1327, %int2_1328, %int4_1329, %none_1330, %cpu_1331, %false_1332 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1333 = torch.constant.int 6 | |
%1973 = torch.prims.convert_element_type %1972, %int6_1333 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1334 = torch.constant.int 128 | |
%1974 = torch.aten.div.Scalar %1973, %int128_1334 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1335 = torch.constant.float 5.000000e+05 | |
%1975 = torch.aten.pow.Scalar %float5.000000e05_1335, %1974 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1976 = torch.aten.reciprocal %1975 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1336 = torch.constant.float 1.000000e+00 | |
%1977 = torch.aten.mul.Scalar %1976, %float1.000000e00_1336 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%1978 = torch.aten.reciprocal %1977 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1337 = torch.constant.float 6.2831853071795862 | |
%1979 = torch.aten.mul.Scalar %1978, %float6.283190e00_1337 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1338 = torch.constant.float 8.192000e+03 | |
%1980 = torch.aten.gt.Scalar %1979, %float8.192000e03_1338 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1339 = torch.constant.int 8 | |
%1981 = torch.aten.div.Scalar %1977, %int8_1339 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1982 = torch.aten.where.self %1980, %1981, %1977 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1983 = torch.aten.reciprocal %1979 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1340 = torch.constant.int 8192 | |
%1984 = torch.aten.mul.Scalar %1983, %int8192_1340 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1341 = torch.constant.int 1 | |
%int1_1342 = torch.constant.int 1 | |
%1985 = torch.aten.sub.Scalar %1984, %int1_1341, %int1_1342 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1343 = torch.constant.int 3 | |
%1986 = torch.aten.div.Scalar %1985, %int3_1343 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1344 = torch.constant.int 1 | |
%int1_1345 = torch.constant.int 1 | |
%1987 = torch.aten.rsub.Scalar %1986, %int1_1344, %int1_1345 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%1988 = torch.aten.mul.Tensor %1987, %1982 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1346 = torch.constant.int 8 | |
%1989 = torch.aten.div.Scalar %1988, %int8_1346 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%1990 = torch.aten.mul.Tensor %1986, %1982 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1347 = torch.constant.int 1 | |
%1991 = torch.aten.add.Tensor %1989, %1990, %int1_1347 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1348 = torch.constant.float 2.048000e+03 | |
%1992 = torch.aten.lt.Scalar %1979, %float2.048000e03_1348 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1993 = torch.aten.bitwise_not %1992 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1349 = torch.constant.float 8.192000e+03 | |
%1994 = torch.aten.gt.Scalar %1979, %float8.192000e03_1349 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%1995 = torch.aten.bitwise_not %1994 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1996 = torch.aten.mul.Tensor %1993, %1995 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%1997 = torch.aten.where.self %1996, %1991, %1982 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%1998 = torch.prim.ListConstruct %1997, %1997 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1350 = torch.constant.int -1 | |
%1999 = torch.aten.cat %1998, %int-1_1350 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1351 = torch.constant.int 6 | |
%2000 = torch.prims.convert_element_type %1999, %int6_1351 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_1352 = torch.constant.int 1 | |
%2001 = torch.aten.unsqueeze %1971, %int1_1352 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1353 = torch.constant.int 6 | |
%2002 = torch.prims.convert_element_type %2001, %int6_1353 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1354 = torch.constant.int 0 | |
%2003 = torch.aten.unsqueeze %2000, %int0_1354 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1355 = torch.constant.int 6 | |
%2004 = torch.prims.convert_element_type %2003, %int6_1355 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2005 = torch.aten.mul.Tensor %2002, %2004 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2006 = torch.aten.cos %2005 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1356 = torch.constant.int 15 | |
%2007 = torch.prims.convert_element_type %2006, %int15_1356 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2008 = torch.aten.sin %2005 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1357 = torch.constant.int 15 | |
%2009 = torch.prims.convert_element_type %2008, %int15_1357 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_1358 = torch.constant.int 0 | |
%int0_1359 = torch.constant.int 0 | |
%int1_1360 = torch.constant.int 1 | |
%2010 = torch.aten.slice.Tensor %2007, %int0_1358, %int0_1359, %777, %int1_1360 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2010, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1361 = torch.constant.int 1 | |
%int0_1362 = torch.constant.int 0 | |
%int9223372036854775807_1363 = torch.constant.int 9223372036854775807 | |
%int1_1364 = torch.constant.int 1 | |
%2011 = torch.aten.slice.Tensor %2010, %int1_1361, %int0_1362, %int9223372036854775807_1363, %int1_1364 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2011, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1365 = torch.constant.int 0 | |
%int0_1366 = torch.constant.int 0 | |
%int1_1367 = torch.constant.int 1 | |
%2012 = torch.aten.slice.Tensor %2009, %int0_1365, %int0_1366, %777, %int1_1367 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2012, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1368 = torch.constant.int 1 | |
%int0_1369 = torch.constant.int 0 | |
%int9223372036854775807_1370 = torch.constant.int 9223372036854775807 | |
%int1_1371 = torch.constant.int 1 | |
%2013 = torch.aten.slice.Tensor %2012, %int1_1368, %int0_1369, %int9223372036854775807_1370, %int1_1371 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2013, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1372 = torch.constant.int 0 | |
%2014 = torch.aten.unsqueeze %2011, %int0_1372 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2014, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1373 = torch.constant.int 1 | |
%int0_1374 = torch.constant.int 0 | |
%int9223372036854775807_1375 = torch.constant.int 9223372036854775807 | |
%int1_1376 = torch.constant.int 1 | |
%2015 = torch.aten.slice.Tensor %2014, %int1_1373, %int0_1374, %int9223372036854775807_1375, %int1_1376 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2015, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1377 = torch.constant.int 2 | |
%2016 = torch.aten.unsqueeze %2015, %int2_1377 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2016, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1378 = torch.constant.int 3 | |
%int0_1379 = torch.constant.int 0 | |
%int9223372036854775807_1380 = torch.constant.int 9223372036854775807 | |
%int1_1381 = torch.constant.int 1 | |
%2017 = torch.aten.slice.Tensor %2016, %int3_1378, %int0_1379, %int9223372036854775807_1380, %int1_1381 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2017, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1382 = torch.constant.int 4 | |
%int1_1383 = torch.constant.int 1 | |
%int1_1384 = torch.constant.int 1 | |
%int1_1385 = torch.constant.int 1 | |
%2018 = torch.prim.ListConstruct %int4_1382, %int1_1383, %int1_1384, %int1_1385 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2019 = torch.aten.repeat %2017, %2018 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2019, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1386 = torch.constant.int 0 | |
%2020 = torch.aten.unsqueeze %2013, %int0_1386 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2020, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1387 = torch.constant.int 1 | |
%int0_1388 = torch.constant.int 0 | |
%int9223372036854775807_1389 = torch.constant.int 9223372036854775807 | |
%int1_1390 = torch.constant.int 1 | |
%2021 = torch.aten.slice.Tensor %2020, %int1_1387, %int0_1388, %int9223372036854775807_1389, %int1_1390 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2021, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1391 = torch.constant.int 2 | |
%2022 = torch.aten.unsqueeze %2021, %int2_1391 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2022, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1392 = torch.constant.int 3 | |
%int0_1393 = torch.constant.int 0 | |
%int9223372036854775807_1394 = torch.constant.int 9223372036854775807 | |
%int1_1395 = torch.constant.int 1 | |
%2023 = torch.aten.slice.Tensor %2022, %int3_1392, %int0_1393, %int9223372036854775807_1394, %int1_1395 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2023, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1396 = torch.constant.int 4 | |
%int1_1397 = torch.constant.int 1 | |
%int1_1398 = torch.constant.int 1 | |
%int1_1399 = torch.constant.int 1 | |
%2024 = torch.prim.ListConstruct %int4_1396, %int1_1397, %int1_1398, %int1_1399 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2025 = torch.aten.repeat %2023, %2024 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2025, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%2026 = torch.aten.mul.Tensor %1905, %2019 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2026, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_1400 = torch.constant.int 3 | |
%int0_1401 = torch.constant.int 0 | |
%int64_1402 = torch.constant.int 64 | |
%int1_1403 = torch.constant.int 1 | |
%2027 = torch.aten.slice.Tensor %1905, %int3_1400, %int0_1401, %int64_1402, %int1_1403 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2027, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_1404 = torch.constant.int 3 | |
%int64_1405 = torch.constant.int 64 | |
%int9223372036854775807_1406 = torch.constant.int 9223372036854775807 | |
%int1_1407 = torch.constant.int 1 | |
%2028 = torch.aten.slice.Tensor %1905, %int3_1404, %int64_1405, %int9223372036854775807_1406, %int1_1407 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2028, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2029 = torch.aten.neg %2028 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2029, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2030 = torch.prim.ListConstruct %2029, %2027 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1408 = torch.constant.int -1 | |
%2031 = torch.aten.cat %2030, %int-1_1408 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2031, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%2032 = torch.aten.mul.Tensor %2031, %2025 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2032, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_1409 = torch.constant.int 1 | |
%2033 = torch.aten.add.Tensor %2026, %2032, %int1_1409 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2033, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
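// Paged KV-cache update for the rotated keys: the page ids (%arg2) are scaled by the 64-entry page stride and offset by a constant that appears to select this block's K slot; the new keys are flattened to [?,32,8,128], bitcast f8E4M3FNUZ<->si8 around the index_put, and scattered into the cache %1740 viewed as [?,32,2,32,8,128].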
%int64_1410 = torch.constant.int 64 | |
%2034 = torch.aten.mul.Scalar %arg2, %int64_1410 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2034, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int6_1411 = torch.constant.int 6 | |
%int1_1412 = torch.constant.int 1 | |
%2035 = torch.aten.add.Scalar %2034, %int6_1411, %int1_1412 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2035, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1413 = torch.constant.int 4 | |
%int32_1414 = torch.constant.int 32 | |
%int8_1415 = torch.constant.int 8 | |
%int128_1416 = torch.constant.int 128 | |
%2036 = torch.prim.ListConstruct %int4_1413, %775, %int32_1414, %int8_1415, %int128_1416 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2037 = torch.aten.view %2033, %2036 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2037, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1417 = torch.constant.int 32 | |
%int8_1418 = torch.constant.int 8 | |
%int128_1419 = torch.constant.int 128 | |
%2038 = torch.prim.ListConstruct %997, %int32_1417, %int8_1418, %int128_1419 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2039 = torch.aten.view %2037, %2038 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2039, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2040 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2041 = torch.aten.view %2035, %2040 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2041, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1420 = torch.constant.int 26 | |
%2042 = torch.prims.convert_element_type %2039, %int26_1420 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2042, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1421 = torch.constant.int 1 | |
%2043 = torch.aten.view.dtype %2042, %int1_1421 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2043, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2044 = torch.aten.detach %2043 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2044, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2045 = torch.aten.detach %2044 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2045, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1422 = torch.constant.int 32 | |
%int2_1423 = torch.constant.int 2 | |
%int32_1424 = torch.constant.int 32 | |
%int8_1425 = torch.constant.int 8 | |
%int128_1426 = torch.constant.int 128 | |
%2046 = torch.prim.ListConstruct %776, %int32_1422, %int2_1423, %int32_1424, %int8_1425, %int128_1426 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2047 = torch.aten.view %1740, %2046 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2047, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1427 = torch.constant.int 32 | |
%int8_1428 = torch.constant.int 8 | |
%int128_1429 = torch.constant.int 128 | |
%2048 = torch.prim.ListConstruct %990, %int32_1427, %int8_1428, %int128_1429 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2049 = torch.aten.view %2047, %2048 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2049, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1430 = torch.constant.int 1 | |
%2050 = torch.aten.view.dtype %2049, %int1_1430 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2050, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2051 = torch.aten.detach %2050 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2051, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2052 = torch.aten.detach %2051 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2052, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2053 = torch.prim.ListConstruct %2041 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1431 = torch.constant.bool false | |
%2054 = torch.aten.index_put %2052, %2053, %2045, %false_1431 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2054, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1432 = torch.constant.int 26 | |
%2055 = torch.aten.view.dtype %2054, %int26_1432 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2055, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2056 = torch.aten.detach %2055 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2056, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2057 = torch.aten.detach %2056 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2057, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1433 = torch.constant.int 32 | |
%int2_1434 = torch.constant.int 2 | |
%int32_1435 = torch.constant.int 32 | |
%int8_1436 = torch.constant.int 8 | |
%int128_1437 = torch.constant.int 128 | |
%2058 = torch.prim.ListConstruct %776, %int32_1433, %int2_1434, %int32_1435, %int8_1436, %int128_1437 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2059 = torch.aten.view %2057, %2058 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2059, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1438 = torch.constant.int 2097152 | |
%2060 = torch.prim.ListConstruct %776, %int2097152_1438 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2061 = torch.aten.view %2059, %2060 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2061, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
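// The value heads take the same path: the slot index is bumped by 1 (apparently the V slot of the same page entry), the states are bitcast to si8, scattered with index_put, and the cache is reshaped back to its flat [?,2097152] layout.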
%int4_1439 = torch.constant.int 4 | |
%int32_1440 = torch.constant.int 32 | |
%int8_1441 = torch.constant.int 8 | |
%int128_1442 = torch.constant.int 128 | |
%2062 = torch.prim.ListConstruct %int4_1439, %775, %int32_1440, %int8_1441, %int128_1442 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2063 = torch.aten.view %1907, %2062 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2063, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1443 = torch.constant.int 32 | |
%int8_1444 = torch.constant.int 8 | |
%int128_1445 = torch.constant.int 128 | |
%2064 = torch.prim.ListConstruct %997, %int32_1443, %int8_1444, %int128_1445 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2065 = torch.aten.view %2063, %2064 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2065, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1446 = torch.constant.int 1 | |
%int1_1447 = torch.constant.int 1 | |
%2066 = torch.aten.add.Scalar %2035, %int1_1446, %int1_1447 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2066, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%2067 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2068 = torch.aten.view %2066, %2067 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2068, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1448 = torch.constant.int 26 | |
%2069 = torch.prims.convert_element_type %2065, %int26_1448 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2069, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1449 = torch.constant.int 1 | |
%2070 = torch.aten.view.dtype %2069, %int1_1449 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2070, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2071 = torch.aten.detach %2070 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2071, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2072 = torch.aten.detach %2071 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2072, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1450 = torch.constant.int 32 | |
%int2_1451 = torch.constant.int 2 | |
%int32_1452 = torch.constant.int 32 | |
%int8_1453 = torch.constant.int 8 | |
%int128_1454 = torch.constant.int 128 | |
%2073 = torch.prim.ListConstruct %776, %int32_1450, %int2_1451, %int32_1452, %int8_1453, %int128_1454 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2074 = torch.aten.view %2061, %2073 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2074, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1455 = torch.constant.int 32 | |
%int8_1456 = torch.constant.int 8 | |
%int128_1457 = torch.constant.int 128 | |
%2075 = torch.prim.ListConstruct %990, %int32_1455, %int8_1456, %int128_1457 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2076 = torch.aten.view %2074, %2075 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2076, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1458 = torch.constant.int 1 | |
%2077 = torch.aten.view.dtype %2076, %int1_1458 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2077, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2078 = torch.aten.detach %2077 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2078, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2079 = torch.aten.detach %2078 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2079, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2080 = torch.prim.ListConstruct %2068 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1459 = torch.constant.bool false | |
%2081 = torch.aten.index_put %2079, %2080, %2072, %false_1459 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2081, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1460 = torch.constant.int 26 | |
%2082 = torch.aten.view.dtype %2081, %int26_1460 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2082, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2083 = torch.aten.detach %2082 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2083, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2084 = torch.aten.detach %2083 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2084, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1461 = torch.constant.int 32 | |
%int2_1462 = torch.constant.int 2 | |
%int32_1463 = torch.constant.int 32 | |
%int8_1464 = torch.constant.int 8 | |
%int128_1465 = torch.constant.int 128 | |
%2085 = torch.prim.ListConstruct %776, %int32_1461, %int2_1462, %int32_1463, %int8_1464, %int128_1465 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2086 = torch.aten.view %2084, %2085 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2086, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1466 = torch.constant.int 2097152 | |
%2087 = torch.prim.ListConstruct %776, %int2097152_1466 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2088 = torch.aten.view %2086, %2087 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2088, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
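// Grouped-query expansion: the 8 KV heads are unsqueezed, expanded by a factor of 4 and reshaped to [4,?,32,128] so keys and values match the 32 query heads.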
%int-2_1467 = torch.constant.int -2 | |
%2089 = torch.aten.unsqueeze %2033, %int-2_1467 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2089, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1468 = torch.constant.int 4 | |
%int8_1469 = torch.constant.int 8 | |
%int4_1470 = torch.constant.int 4 | |
%int128_1471 = torch.constant.int 128 | |
%2090 = torch.prim.ListConstruct %int4_1468, %777, %int8_1469, %int4_1470, %int128_1471 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1472 = torch.constant.bool false | |
%2091 = torch.aten.expand %2089, %2090, %false_1472 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2091, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1473 = torch.constant.int 0 | |
%2092 = torch.aten.clone %2091, %int0_1473 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2092, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1474 = torch.constant.int 4 | |
%int32_1475 = torch.constant.int 32 | |
%int128_1476 = torch.constant.int 128 | |
%2093 = torch.prim.ListConstruct %int4_1474, %777, %int32_1475, %int128_1476 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2094 = torch.aten._unsafe_view %2092, %2093 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2094, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_1477 = torch.constant.int -2 | |
%2095 = torch.aten.unsqueeze %1907, %int-2_1477 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2095, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1478 = torch.constant.int 4 | |
%int8_1479 = torch.constant.int 8 | |
%int4_1480 = torch.constant.int 4 | |
%int128_1481 = torch.constant.int 128 | |
%2096 = torch.prim.ListConstruct %int4_1478, %777, %int8_1479, %int4_1480, %int128_1481 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1482 = torch.constant.bool false | |
%2097 = torch.aten.expand %2095, %2096, %false_1482 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2097, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1483 = torch.constant.int 0 | |
%2098 = torch.aten.clone %2097, %int0_1483 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2098, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1484 = torch.constant.int 4 | |
%int32_1485 = torch.constant.int 32 | |
%int128_1486 = torch.constant.int 128 | |
%2099 = torch.prim.ListConstruct %int4_1484, %777, %int32_1485, %int128_1486 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2100 = torch.aten._unsafe_view %2098, %2099 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2100, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
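// Transpose Q, K and V from [4,?,32,128] to [4,32,?,128] ahead of the attention computation.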
%int1_1487 = torch.constant.int 1 | |
%int2_1488 = torch.constant.int 2 | |
%2101 = torch.aten.transpose.int %1970, %int1_1487, %int2_1488 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2101, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1489 = torch.constant.int 1 | |
%int2_1490 = torch.constant.int 2 | |
%2102 = torch.aten.transpose.int %2094, %int1_1489, %int2_1490 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2102, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1491 = torch.constant.int 1 | |
%int2_1492 = torch.constant.int 2 | |
%2103 = torch.aten.transpose.int %2100, %int1_1491, %int2_1492 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2103, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
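// Torch dtype constant 26 corresponds to f8E4M3FNUZ here; the three converts below keep the operands in fp8 (result types are unchanged) ahead of the fused attention call.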
%int26_1493 = torch.constant.int 26 | |
%2104 = torch.prims.convert_element_type %2101, %int26_1493 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2104, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1494 = torch.constant.int 26 | |
%2105 = torch.prims.convert_element_type %2102, %int26_1494 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2105, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1495 = torch.constant.int 26 | |
%2106 = torch.prims.convert_element_type %2103, %int26_1495 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2106, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
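// The [4, 1, seq, seq] f32 tensor %803 is cast to fp8 and reduced to a single [seq, seq] slice (index 0 on the two leading dims); it is used as the mask operand of the attention kernel below.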
%int26_1496 = torch.constant.int 26 | |
%2107 = torch.prims.convert_element_type %803, %int26_1496 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2107, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_1497 = torch.constant.int 0 | |
%int0_1498 = torch.constant.int 0 | |
%2108 = torch.aten.select.int %2107, %int0_1497, %int0_1498 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2108, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_1499 = torch.constant.int 0 | |
%int0_1500 = torch.constant.int 0 | |
%2109 = torch.aten.select.int %2108, %int0_1499, %int0_1500 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2109, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_1501 = torch.constant.int 0 | |
%int0_1502 = torch.constant.int 0 | |
%int9223372036854775807_1503 = torch.constant.int 9223372036854775807 | |
%int1_1504 = torch.constant.int 1 | |
%2110 = torch.aten.slice.Tensor %2109, %int0_1501, %int0_1502, %int9223372036854775807_1503, %int1_1504 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2110, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_1505 = torch.constant.int 1 | |
%int0_1506 = torch.constant.int 0 | |
%int9223372036854775807_1507 = torch.constant.int 9223372036854775807 | |
%int1_1508 = torch.constant.int 1 | |
%2111 = torch.aten.slice.Tensor %2110, %int1_1505, %int0_1506, %int9223372036854775807_1507, %int1_1508 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2111, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
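// %83 is a scalar f32; the clone/detach chain simply materializes it as a constant. It is passed to the attention kernel in the scale position, and Q/K/V, the scale, and the mask are bridged to builtin tensors for the external call.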
%none_1509 = torch.constant.none | |
%2112 = torch.aten.clone %83, %none_1509 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%2113 = torch.aten.detach %2112 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2114 = torch.aten.detach %2113 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2115 = torch.aten.detach %2114 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2116 = torch_c.to_builtin_tensor %2104 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2117 = torch_c.to_builtin_tensor %2105 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2118 = torch_c.to_builtin_tensor %2106 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2119 = torch_c.to_builtin_tensor %2111 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%2120 = torch_c.to_builtin_tensor %2115 : !torch.vtensor<[],f32> -> tensor<f32> | |
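// Fused masked flash-attention microkernel: (Q, K, V, scale, mask) in fp8/f32 -> [4, 32, seq, 128] f32 output.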
%2121 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%2116, %2117, %2118, %2120, %2119) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%2122 = torch_c.from_builtin_tensor %2121 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %2122, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
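// Transpose the attention output back to [4, seq, 32, 128] and flatten the heads into the 4096 model dimension.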
%int1_1510 = torch.constant.int 1 | |
%int2_1511 = torch.constant.int 2 | |
%2123 = torch.aten.transpose.int %2122, %int1_1510, %int2_1511 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2123, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_1512 = torch.constant.int 0 | |
%2124 = torch.aten.clone %2123, %int0_1512 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2124, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_1513 = torch.constant.int 4 | |
%int4096_1514 = torch.constant.int 4096 | |
%2125 = torch.prim.ListConstruct %int4_1513, %777, %int4096_1514 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2126 = torch.aten._unsafe_view %2124, %2125 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2126, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
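// Re-quantize for the output projection: divide by the per-tensor input scale %84, clamp to [-240, 240] (the finite range of f8E4M3FNUZ), and cast to fp8.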
%2127 = torch.aten.div.Tensor %2126, %84 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2127, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1515 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1516 = torch.constant.float 2.400000e+02 | |
%2128 = torch.aten.clamp %2127, %float-2.400000e02_1515, %float2.400000e02_1516 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2128, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1517 = torch.constant.int 26 | |
%2129 = torch.prims.convert_element_type %2128, %int26_1517 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2129, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
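// Output projection: the [4096, 4096] fp8 weight %85 is broadcast to the batch of 4 and applied with the transpose-B batch-matmul kernel, accumulating in f32.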
%int0_1518 = torch.constant.int 0 | |
%2130 = torch.aten.unsqueeze %85, %int0_1518 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1519 = torch.constant.int 4 | |
%int4096_1520 = torch.constant.int 4096 | |
%int4096_1521 = torch.constant.int 4096 | |
%2131 = torch.prim.ListConstruct %int4_1519, %int4096_1520, %int4096_1521 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1522 = torch.constant.bool false | |
%2132 = torch.aten.expand %2130, %2131, %false_1522 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2133 = torch_c.to_builtin_tensor %2129 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2134 = torch_c.to_builtin_tensor %2132 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2135 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2133, %2134) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2136 = torch_c.from_builtin_tensor %2135 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2136, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
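// Dequantize by the product of the two per-tensor scales (%84 * %86), then add the residual %1852 carried in from the block input.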
%2137 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2138 = torch.aten.permute %86, %2137 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2139 = torch.aten.mul.Tensor %84, %2138 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1523 = torch.constant.int 6 | |
%2140 = torch.prims.convert_element_type %2136, %int6_1523 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2140, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2141 = torch.aten.mul.Tensor %2140, %2139 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2141, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1524 = torch.constant.int 1 | |
%2142 = torch.aten.add.Tensor %1852, %2141, %int1_1524 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2142, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
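// RMSNorm: square, mean over the last dim, add eps 1e-05, rsqrt, scale by the bf16 weight %87.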
%int6_1525 = torch.constant.int 6 | |
%2143 = torch.prims.convert_element_type %2142, %int6_1525 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2143, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1526 = torch.constant.int 2 | |
%2144 = torch.aten.pow.Tensor_Scalar %2143, %int2_1526 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2144, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1527 = torch.constant.int -1 | |
%2145 = torch.prim.ListConstruct %int-1_1527 : (!torch.int) -> !torch.list<int> | |
%true_1528 = torch.constant.bool true | |
%none_1529 = torch.constant.none | |
%2146 = torch.aten.mean.dim %2144, %2145, %true_1528, %none_1529 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2146, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1530 = torch.constant.float 1.000000e-05 | |
%int1_1531 = torch.constant.int 1 | |
%2147 = torch.aten.add.Scalar %2146, %float1.000000e-05_1530, %int1_1531 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2147, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2148 = torch.aten.rsqrt %2147 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2148, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2149 = torch.aten.mul.Tensor %2143, %2148 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2149, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1532 = torch.constant.int 6 | |
%2150 = torch.prims.convert_element_type %2149, %int6_1532 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2150, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2151 = torch.aten.mul.Tensor %87, %2150 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2151, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1533 = torch.constant.int 6 | |
%2152 = torch.prims.convert_element_type %2151, %int6_1533 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2152, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2153 = torch.aten.div.Tensor %2152, %88 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2153, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1534 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1535 = torch.constant.float 2.400000e+02 | |
%2154 = torch.aten.clamp %2153, %float-2.400000e02_1534, %float2.400000e02_1535 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2154, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1536 = torch.constant.int 26 | |
%2155 = torch.prims.convert_element_type %2154, %int26_1536 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2155, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
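// The fp8 activation just produced feeds the gate projection below: batch-matmul against the [14336, 4096] weight %89, dequantize with %90 * %88, then SiLU.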
%int0_1537 = torch.constant.int 0 | |
%2156 = torch.aten.unsqueeze %89, %int0_1537 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1538 = torch.constant.int 4 | |
%int14336_1539 = torch.constant.int 14336 | |
%int4096_1540 = torch.constant.int 4096 | |
%2157 = torch.prim.ListConstruct %int4_1538, %int14336_1539, %int4096_1540 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1541 = torch.constant.bool false | |
%2158 = torch.aten.expand %2156, %2157, %false_1541 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2159 = torch_c.to_builtin_tensor %2155 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2160 = torch_c.to_builtin_tensor %2158 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2161 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2159, %2160) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2162 = torch_c.from_builtin_tensor %2161 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2162, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2163 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2164 = torch.aten.permute %90, %2163 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2165 = torch.aten.mul.Tensor %88, %2164 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1542 = torch.constant.int 6 | |
%2166 = torch.prims.convert_element_type %2162, %int6_1542 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2166, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2167 = torch.aten.mul.Tensor %2166, %2165 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2167, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2168 = torch.aten.silu %2167 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2168, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
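// Up projection: the same quantize -> fp8 matmul -> dequantize pattern against the second [14336, 4096] weight %92; its result is multiplied elementwise with the SiLU'd gate (SwiGLU).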
%2169 = torch.aten.div.Tensor %2152, %91 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2169, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1543 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1544 = torch.constant.float 2.400000e+02 | |
%2170 = torch.aten.clamp %2169, %float-2.400000e02_1543, %float2.400000e02_1544 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2170, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1545 = torch.constant.int 26 | |
%2171 = torch.prims.convert_element_type %2170, %int26_1545 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2171, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1546 = torch.constant.int 0 | |
%2172 = torch.aten.unsqueeze %92, %int0_1546 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1547 = torch.constant.int 4 | |
%int14336_1548 = torch.constant.int 14336 | |
%int4096_1549 = torch.constant.int 4096 | |
%2173 = torch.prim.ListConstruct %int4_1547, %int14336_1548, %int4096_1549 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1550 = torch.constant.bool false | |
%2174 = torch.aten.expand %2172, %2173, %false_1550 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2175 = torch_c.to_builtin_tensor %2171 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2176 = torch_c.to_builtin_tensor %2174 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2177 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2175, %2176) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2178 = torch_c.from_builtin_tensor %2177 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2178, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2179 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2180 = torch.aten.permute %93, %2179 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2181 = torch.aten.mul.Tensor %91, %2180 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1551 = torch.constant.int 6 | |
%2182 = torch.prims.convert_element_type %2178, %int6_1551 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2182, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2183 = torch.aten.mul.Tensor %2182, %2181 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2183, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2184 = torch.aten.mul.Tensor %2168, %2183 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2184, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
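// Down projection: quantize the 14336-wide product (scale %94), matmul against the [4096, 14336] weight %95, and dequantize with %96 * %94.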
%2185 = torch.aten.div.Tensor %2184, %94 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2185, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_1552 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1553 = torch.constant.float 2.400000e+02 | |
%2186 = torch.aten.clamp %2185, %float-2.400000e02_1552, %float2.400000e02_1553 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2186, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_1554 = torch.constant.int 26 | |
%2187 = torch.prims.convert_element_type %2186, %int26_1554 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2187, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_1555 = torch.constant.int 0 | |
%2188 = torch.aten.unsqueeze %95, %int0_1555 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_1556 = torch.constant.int 4 | |
%int4096_1557 = torch.constant.int 4096 | |
%int14336_1558 = torch.constant.int 14336 | |
%2189 = torch.prim.ListConstruct %int4_1556, %int4096_1557, %int14336_1558 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1559 = torch.constant.bool false | |
%2190 = torch.aten.expand %2188, %2189, %false_1559 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%2191 = torch_c.to_builtin_tensor %2187 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%2192 = torch_c.to_builtin_tensor %2190 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%2193 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%2191, %2192) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2194 = torch_c.from_builtin_tensor %2193 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2194, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2195 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2196 = torch.aten.permute %96, %2195 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2197 = torch.aten.mul.Tensor %94, %2196 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1560 = torch.constant.int 6 | |
%2198 = torch.prims.convert_element_type %2194, %int6_1560 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2198, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2199 = torch.aten.mul.Tensor %2198, %2197 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2199, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1561 = torch.constant.int 1 | |
%2200 = torch.aten.add.Tensor %2142, %2199, %int1_1561 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2200, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
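// The residual add %2142 + %2199 closes this transformer block. The IR below starts the next block: input RMSNorm followed by the Q/K/V projections.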
%int6_1562 = torch.constant.int 6 | |
%2201 = torch.prims.convert_element_type %2200, %int6_1562 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2201, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1563 = torch.constant.int 2 | |
%2202 = torch.aten.pow.Tensor_Scalar %2201, %int2_1563 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2202, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1564 = torch.constant.int -1 | |
%2203 = torch.prim.ListConstruct %int-1_1564 : (!torch.int) -> !torch.list<int> | |
%true_1565 = torch.constant.bool true | |
%none_1566 = torch.constant.none | |
%2204 = torch.aten.mean.dim %2202, %2203, %true_1565, %none_1566 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2204, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1567 = torch.constant.float 1.000000e-05 | |
%int1_1568 = torch.constant.int 1 | |
%2205 = torch.aten.add.Scalar %2204, %float1.000000e-05_1567, %int1_1568 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2205, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2206 = torch.aten.rsqrt %2205 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2206, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2207 = torch.aten.mul.Tensor %2201, %2206 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2207, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1569 = torch.constant.int 6 | |
%2208 = torch.prims.convert_element_type %2207, %int6_1569 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2208, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2209 = torch.aten.mul.Tensor %97, %2208 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2209, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1570 = torch.constant.int 6 | |
%2210 = torch.prims.convert_element_type %2209, %int6_1570 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2210, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
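// Q/K/V projections: for each, the normalized activation is scaled, clamped, cast to fp8, multiplied against the fp8 weight ([4096, 4096] for Q, [1024, 4096] for K and V), and the f32 result is rescaled and re-cast to fp8.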
%2211 = torch.aten.div.Tensor %2210, %98 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2211, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1571 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1572 = torch.constant.float 2.400000e+02 | |
%2212 = torch.aten.clamp %2211, %float-2.400000e02_1571, %float2.400000e02_1572 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2212, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1573 = torch.constant.int 26 | |
%2213 = torch.prims.convert_element_type %2212, %int26_1573 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2213, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1574 = torch.constant.int 0 | |
%2214 = torch.aten.unsqueeze %99, %int0_1574 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1575 = torch.constant.int 4 | |
%int4096_1576 = torch.constant.int 4096 | |
%int4096_1577 = torch.constant.int 4096 | |
%2215 = torch.prim.ListConstruct %int4_1575, %int4096_1576, %int4096_1577 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1578 = torch.constant.bool false | |
%2216 = torch.aten.expand %2214, %2215, %false_1578 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2217 = torch_c.to_builtin_tensor %2213 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2218 = torch_c.to_builtin_tensor %2216 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2219 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2217, %2218) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2220 = torch_c.from_builtin_tensor %2219 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2220, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2221 = torch.aten.div.Tensor %2220, %100 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2221, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1579 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1580 = torch.constant.float 2.400000e+02 | |
%2222 = torch.aten.clamp %2221, %float-2.400000e02_1579, %float2.400000e02_1580 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2222, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1581 = torch.constant.int 26 | |
%2223 = torch.prims.convert_element_type %2222, %int26_1581 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2223, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%2224 = torch.aten.div.Tensor %2210, %101 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2224, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1582 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1583 = torch.constant.float 2.400000e+02 | |
%2225 = torch.aten.clamp %2224, %float-2.400000e02_1582, %float2.400000e02_1583 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2225, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1584 = torch.constant.int 26 | |
%2226 = torch.prims.convert_element_type %2225, %int26_1584 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2226, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1585 = torch.constant.int 0 | |
%2227 = torch.aten.unsqueeze %102, %int0_1585 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1586 = torch.constant.int 4 | |
%int1024_1587 = torch.constant.int 1024 | |
%int4096_1588 = torch.constant.int 4096 | |
%2228 = torch.prim.ListConstruct %int4_1586, %int1024_1587, %int4096_1588 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1589 = torch.constant.bool false | |
%2229 = torch.aten.expand %2227, %2228, %false_1589 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2230 = torch_c.to_builtin_tensor %2226 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2231 = torch_c.to_builtin_tensor %2229 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2232 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2230, %2231) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2233 = torch_c.from_builtin_tensor %2232 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2233, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2234 = torch.aten.div.Tensor %2233, %103 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2234, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1590 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1591 = torch.constant.float 2.400000e+02 | |
%2235 = torch.aten.clamp %2234, %float-2.400000e02_1590, %float2.400000e02_1591 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2235, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1592 = torch.constant.int 26 | |
%2236 = torch.prims.convert_element_type %2235, %int26_1592 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2236, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%2237 = torch.aten.div.Tensor %2210, %104 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2237, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1593 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1594 = torch.constant.float 2.400000e+02 | |
%2238 = torch.aten.clamp %2237, %float-2.400000e02_1593, %float2.400000e02_1594 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2238, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1595 = torch.constant.int 26 | |
%2239 = torch.prims.convert_element_type %2238, %int26_1595 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2239, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1596 = torch.constant.int 0 | |
%2240 = torch.aten.unsqueeze %105, %int0_1596 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1597 = torch.constant.int 4 | |
%int1024_1598 = torch.constant.int 1024 | |
%int4096_1599 = torch.constant.int 4096 | |
%2241 = torch.prim.ListConstruct %int4_1597, %int1024_1598, %int4096_1599 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1600 = torch.constant.bool false | |
%2242 = torch.aten.expand %2240, %2241, %false_1600 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2243 = torch_c.to_builtin_tensor %2239 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2244 = torch_c.to_builtin_tensor %2242 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2245 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2243, %2244) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2246 = torch_c.from_builtin_tensor %2245 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2246, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2247 = torch.aten.div.Tensor %2246, %106 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2247, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1601 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1602 = torch.constant.float 2.400000e+02 | |
%2248 = torch.aten.clamp %2247, %float-2.400000e02_1601, %float2.400000e02_1602 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2248, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1603 = torch.constant.int 26 | |
%2249 = torch.prims.convert_element_type %2248, %int26_1603 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2249, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
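// Reshape the projections into heads: Q to [4, seq, 32, 128], K and V to [4, seq, 8, 128].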
%int4_1604 = torch.constant.int 4 | |
%int32_1605 = torch.constant.int 32 | |
%int128_1606 = torch.constant.int 128 | |
%2250 = torch.prim.ListConstruct %int4_1604, %777, %int32_1605, %int128_1606 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2251 = torch.aten.view %2223, %2250 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2251, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_1607 = torch.constant.int 4 | |
%int8_1608 = torch.constant.int 8 | |
%int128_1609 = torch.constant.int 128 | |
%2252 = torch.prim.ListConstruct %int4_1607, %777, %int8_1608, %int128_1609 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2253 = torch.aten.view %2236, %2252 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2253, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_1610 = torch.constant.int 4 | |
%int8_1611 = torch.constant.int 8 | |
%int128_1612 = torch.constant.int 128 | |
%2254 = torch.prim.ListConstruct %int4_1610, %777, %int8_1611, %int128_1612 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2255 = torch.aten.view %2249, %2254 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2255, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
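// RoPE table construction: positions 0..131071 and 64 inverse frequencies with base 500000, rescaled for long context (wavelengths above 8192 are divided by 8, with a smooth blend between the 2048 and 8192 cutoffs; this appears to be Llama-3-style frequency scaling). The frequencies are duplicated to 128 lanes and expanded into cos/sin tables in bf16.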
%int131072_1613 = torch.constant.int 131072 | |
%none_1614 = torch.constant.none | |
%none_1615 = torch.constant.none | |
%cpu_1616 = torch.constant.device "cpu" | |
%false_1617 = torch.constant.bool false | |
%2256 = torch.aten.arange %int131072_1613, %none_1614, %none_1615, %cpu_1616, %false_1617 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1618 = torch.constant.int 0 | |
%int128_1619 = torch.constant.int 128 | |
%int2_1620 = torch.constant.int 2 | |
%int4_1621 = torch.constant.int 4 | |
%none_1622 = torch.constant.none | |
%cpu_1623 = torch.constant.device "cpu" | |
%false_1624 = torch.constant.bool false | |
%2257 = torch.aten.arange.start_step %int0_1618, %int128_1619, %int2_1620, %int4_1621, %none_1622, %cpu_1623, %false_1624 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1625 = torch.constant.int 6 | |
%2258 = torch.prims.convert_element_type %2257, %int6_1625 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1626 = torch.constant.int 128 | |
%2259 = torch.aten.div.Scalar %2258, %int128_1626 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1627 = torch.constant.float 5.000000e+05 | |
%2260 = torch.aten.pow.Scalar %float5.000000e05_1627, %2259 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2261 = torch.aten.reciprocal %2260 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1628 = torch.constant.float 1.000000e+00 | |
%2262 = torch.aten.mul.Scalar %2261, %float1.000000e00_1628 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2263 = torch.aten.reciprocal %2262 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1629 = torch.constant.float 6.2831853071795862 | |
%2264 = torch.aten.mul.Scalar %2263, %float6.283190e00_1629 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1630 = torch.constant.float 8.192000e+03 | |
%2265 = torch.aten.gt.Scalar %2264, %float8.192000e03_1630 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1631 = torch.constant.int 8 | |
%2266 = torch.aten.div.Scalar %2262, %int8_1631 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2267 = torch.aten.where.self %2265, %2266, %2262 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2268 = torch.aten.reciprocal %2264 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1632 = torch.constant.int 8192 | |
%2269 = torch.aten.mul.Scalar %2268, %int8192_1632 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1633 = torch.constant.int 1 | |
%int1_1634 = torch.constant.int 1 | |
%2270 = torch.aten.sub.Scalar %2269, %int1_1633, %int1_1634 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1635 = torch.constant.int 3 | |
%2271 = torch.aten.div.Scalar %2270, %int3_1635 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1636 = torch.constant.int 1 | |
%int1_1637 = torch.constant.int 1 | |
%2272 = torch.aten.rsub.Scalar %2271, %int1_1636, %int1_1637 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2273 = torch.aten.mul.Tensor %2272, %2267 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1638 = torch.constant.int 8 | |
%2274 = torch.aten.div.Scalar %2273, %int8_1638 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2275 = torch.aten.mul.Tensor %2271, %2267 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1639 = torch.constant.int 1 | |
%2276 = torch.aten.add.Tensor %2274, %2275, %int1_1639 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1640 = torch.constant.float 2.048000e+03 | |
%2277 = torch.aten.lt.Scalar %2264, %float2.048000e03_1640 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2278 = torch.aten.bitwise_not %2277 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1641 = torch.constant.float 8.192000e+03 | |
%2279 = torch.aten.gt.Scalar %2264, %float8.192000e03_1641 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2280 = torch.aten.bitwise_not %2279 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2281 = torch.aten.mul.Tensor %2278, %2280 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2282 = torch.aten.where.self %2281, %2276, %2267 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2283 = torch.prim.ListConstruct %2282, %2282 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1642 = torch.constant.int -1 | |
%2284 = torch.aten.cat %2283, %int-1_1642 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1643 = torch.constant.int 6 | |
%2285 = torch.prims.convert_element_type %2284, %int6_1643 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_1644 = torch.constant.int 1 | |
%2286 = torch.aten.unsqueeze %2256, %int1_1644 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1645 = torch.constant.int 6 | |
%2287 = torch.prims.convert_element_type %2286, %int6_1645 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1646 = torch.constant.int 0 | |
%2288 = torch.aten.unsqueeze %2285, %int0_1646 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1647 = torch.constant.int 6 | |
%2289 = torch.prims.convert_element_type %2288, %int6_1647 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2290 = torch.aten.mul.Tensor %2287, %2289 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2291 = torch.aten.cos %2290 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1648 = torch.constant.int 15 | |
%2292 = torch.prims.convert_element_type %2291, %int15_1648 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2293 = torch.aten.sin %2290 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1649 = torch.constant.int 15 | |
%2294 = torch.prims.convert_element_type %2293, %int15_1649 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
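// Slice the cos/sin tables to the current sequence length (%777 rows) and broadcast them to the batch of 4 as [4, seq, 1, 128].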
%int0_1650 = torch.constant.int 0 | |
%int0_1651 = torch.constant.int 0 | |
%int1_1652 = torch.constant.int 1 | |
%2295 = torch.aten.slice.Tensor %2292, %int0_1650, %int0_1651, %777, %int1_1652 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2295, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1653 = torch.constant.int 1 | |
%int0_1654 = torch.constant.int 0 | |
%int9223372036854775807_1655 = torch.constant.int 9223372036854775807 | |
%int1_1656 = torch.constant.int 1 | |
%2296 = torch.aten.slice.Tensor %2295, %int1_1653, %int0_1654, %int9223372036854775807_1655, %int1_1656 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2296, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1657 = torch.constant.int 0 | |
%int0_1658 = torch.constant.int 0 | |
%int1_1659 = torch.constant.int 1 | |
%2297 = torch.aten.slice.Tensor %2294, %int0_1657, %int0_1658, %777, %int1_1659 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2297, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1660 = torch.constant.int 1 | |
%int0_1661 = torch.constant.int 0 | |
%int9223372036854775807_1662 = torch.constant.int 9223372036854775807 | |
%int1_1663 = torch.constant.int 1 | |
%2298 = torch.aten.slice.Tensor %2297, %int1_1660, %int0_1661, %int9223372036854775807_1662, %int1_1663 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2298, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1664 = torch.constant.int 0 | |
%2299 = torch.aten.unsqueeze %2296, %int0_1664 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2299, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1665 = torch.constant.int 1 | |
%int0_1666 = torch.constant.int 0 | |
%int9223372036854775807_1667 = torch.constant.int 9223372036854775807 | |
%int1_1668 = torch.constant.int 1 | |
%2300 = torch.aten.slice.Tensor %2299, %int1_1665, %int0_1666, %int9223372036854775807_1667, %int1_1668 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2300, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1669 = torch.constant.int 2 | |
%2301 = torch.aten.unsqueeze %2300, %int2_1669 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2301, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1670 = torch.constant.int 3 | |
%int0_1671 = torch.constant.int 0 | |
%int9223372036854775807_1672 = torch.constant.int 9223372036854775807 | |
%int1_1673 = torch.constant.int 1 | |
%2302 = torch.aten.slice.Tensor %2301, %int3_1670, %int0_1671, %int9223372036854775807_1672, %int1_1673 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2302, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1674 = torch.constant.int 4 | |
%int1_1675 = torch.constant.int 1 | |
%int1_1676 = torch.constant.int 1 | |
%int1_1677 = torch.constant.int 1 | |
%2303 = torch.prim.ListConstruct %int4_1674, %int1_1675, %int1_1676, %int1_1677 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2304 = torch.aten.repeat %2302, %2303 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2304, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1678 = torch.constant.int 0 | |
%2305 = torch.aten.unsqueeze %2298, %int0_1678 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2305, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1679 = torch.constant.int 1 | |
%int0_1680 = torch.constant.int 0 | |
%int9223372036854775807_1681 = torch.constant.int 9223372036854775807 | |
%int1_1682 = torch.constant.int 1 | |
%2306 = torch.aten.slice.Tensor %2305, %int1_1679, %int0_1680, %int9223372036854775807_1681, %int1_1682 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2306, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1683 = torch.constant.int 2 | |
%2307 = torch.aten.unsqueeze %2306, %int2_1683 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2307, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1684 = torch.constant.int 3 | |
%int0_1685 = torch.constant.int 0 | |
%int9223372036854775807_1686 = torch.constant.int 9223372036854775807 | |
%int1_1687 = torch.constant.int 1 | |
%2308 = torch.aten.slice.Tensor %2307, %int3_1684, %int0_1685, %int9223372036854775807_1686, %int1_1687 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2308, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1688 = torch.constant.int 4 | |
%int1_1689 = torch.constant.int 1 | |
%int1_1690 = torch.constant.int 1 | |
%int1_1691 = torch.constant.int 1 | |
%2309 = torch.prim.ListConstruct %int4_1688, %int1_1689, %int1_1690, %int1_1691 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2310 = torch.aten.repeat %2308, %2309 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2310, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
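// Apply the rotary embedding to Q: q * cos + rotate_half(q) * sin, where rotate_half negates the upper 64 lanes and swaps the two halves.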
%2311 = torch.aten.mul.Tensor %2251, %2304 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2311, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_1692 = torch.constant.int 3 | |
%int0_1693 = torch.constant.int 0 | |
%int64_1694 = torch.constant.int 64 | |
%int1_1695 = torch.constant.int 1 | |
%2312 = torch.aten.slice.Tensor %2251, %int3_1692, %int0_1693, %int64_1694, %int1_1695 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2312, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_1696 = torch.constant.int 3 | |
%int64_1697 = torch.constant.int 64 | |
%int9223372036854775807_1698 = torch.constant.int 9223372036854775807 | |
%int1_1699 = torch.constant.int 1 | |
%2313 = torch.aten.slice.Tensor %2251, %int3_1696, %int64_1697, %int9223372036854775807_1698, %int1_1699 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2313, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2314 = torch.aten.neg %2313 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2314, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2315 = torch.prim.ListConstruct %2314, %2312 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1700 = torch.constant.int -1 | |
%2316 = torch.aten.cat %2315, %int-1_1700 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2316, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%2317 = torch.aten.mul.Tensor %2316, %2310 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2317, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_1701 = torch.constant.int 1 | |
%2318 = torch.aten.add.Tensor %2311, %2317, %int1_1701 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2318, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
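// The same RoPE table computation is emitted again below; it appears to feed the key-side rotary embedding.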
%int131072_1702 = torch.constant.int 131072 | |
%none_1703 = torch.constant.none | |
%none_1704 = torch.constant.none | |
%cpu_1705 = torch.constant.device "cpu" | |
%false_1706 = torch.constant.bool false | |
%2319 = torch.aten.arange %int131072_1702, %none_1703, %none_1704, %cpu_1705, %false_1706 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1707 = torch.constant.int 0 | |
%int128_1708 = torch.constant.int 128 | |
%int2_1709 = torch.constant.int 2 | |
%int4_1710 = torch.constant.int 4 | |
%none_1711 = torch.constant.none | |
%cpu_1712 = torch.constant.device "cpu" | |
%false_1713 = torch.constant.bool false | |
%2320 = torch.aten.arange.start_step %int0_1707, %int128_1708, %int2_1709, %int4_1710, %none_1711, %cpu_1712, %false_1713 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_1714 = torch.constant.int 6 | |
%2321 = torch.prims.convert_element_type %2320, %int6_1714 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_1715 = torch.constant.int 128 | |
%2322 = torch.aten.div.Scalar %2321, %int128_1715 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_1716 = torch.constant.float 5.000000e+05 | |
%2323 = torch.aten.pow.Scalar %float5.000000e05_1716, %2322 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2324 = torch.aten.reciprocal %2323 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_1717 = torch.constant.float 1.000000e+00 | |
%2325 = torch.aten.mul.Scalar %2324, %float1.000000e00_1717 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2326 = torch.aten.reciprocal %2325 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_1718 = torch.constant.float 6.2831853071795862 | |
%2327 = torch.aten.mul.Scalar %2326, %float6.283190e00_1718 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_1719 = torch.constant.float 8.192000e+03 | |
%2328 = torch.aten.gt.Scalar %2327, %float8.192000e03_1719 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_1720 = torch.constant.int 8 | |
%2329 = torch.aten.div.Scalar %2325, %int8_1720 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2330 = torch.aten.where.self %2328, %2329, %2325 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2331 = torch.aten.reciprocal %2327 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_1721 = torch.constant.int 8192 | |
%2332 = torch.aten.mul.Scalar %2331, %int8192_1721 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1722 = torch.constant.int 1 | |
%int1_1723 = torch.constant.int 1 | |
%2333 = torch.aten.sub.Scalar %2332, %int1_1722, %int1_1723 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_1724 = torch.constant.int 3 | |
%2334 = torch.aten.div.Scalar %2333, %int3_1724 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_1725 = torch.constant.int 1 | |
%int1_1726 = torch.constant.int 1 | |
%2335 = torch.aten.rsub.Scalar %2334, %int1_1725, %int1_1726 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2336 = torch.aten.mul.Tensor %2335, %2330 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_1727 = torch.constant.int 8 | |
%2337 = torch.aten.div.Scalar %2336, %int8_1727 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2338 = torch.aten.mul.Tensor %2334, %2330 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_1728 = torch.constant.int 1 | |
%2339 = torch.aten.add.Tensor %2337, %2338, %int1_1728 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_1729 = torch.constant.float 2.048000e+03 | |
%2340 = torch.aten.lt.Scalar %2327, %float2.048000e03_1729 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2341 = torch.aten.bitwise_not %2340 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_1730 = torch.constant.float 8.192000e+03 | |
%2342 = torch.aten.gt.Scalar %2327, %float8.192000e03_1730 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2343 = torch.aten.bitwise_not %2342 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2344 = torch.aten.mul.Tensor %2341, %2343 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2345 = torch.aten.where.self %2344, %2339, %2330 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2346 = torch.prim.ListConstruct %2345, %2345 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_1731 = torch.constant.int -1 | |
%2347 = torch.aten.cat %2346, %int-1_1731 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_1732 = torch.constant.int 6 | |
%2348 = torch.prims.convert_element_type %2347, %int6_1732 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
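// Outer product of positions (%2350) and the duplicated frequency vector (%2352) yields the
// angle table %2353 of shape [131072, 128]; its cos and sin are cast to bf16 as the rotary tables.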
%int1_1733 = torch.constant.int 1 | |
%2349 = torch.aten.unsqueeze %2319, %int1_1733 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_1734 = torch.constant.int 6 | |
%2350 = torch.prims.convert_element_type %2349, %int6_1734 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_1735 = torch.constant.int 0 | |
%2351 = torch.aten.unsqueeze %2348, %int0_1735 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_1736 = torch.constant.int 6 | |
%2352 = torch.prims.convert_element_type %2351, %int6_1736 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2353 = torch.aten.mul.Tensor %2350, %2352 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2354 = torch.aten.cos %2353 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1737 = torch.constant.int 15 | |
%2355 = torch.prims.convert_element_type %2354, %int15_1737 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2356 = torch.aten.sin %2353 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_1738 = torch.constant.int 15 | |
%2357 = torch.prims.convert_element_type %2356, %int15_1738 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
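// The cos/sin tables are sliced to the current sequence length (%777, i.e. s0 * 32 rows)
// and broadcast to the batch of 4 via the unsqueeze/slice/repeat sequence below.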
%int0_1739 = torch.constant.int 0 | |
%int0_1740 = torch.constant.int 0 | |
%int1_1741 = torch.constant.int 1 | |
%2358 = torch.aten.slice.Tensor %2355, %int0_1739, %int0_1740, %777, %int1_1741 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2358, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1742 = torch.constant.int 1 | |
%int0_1743 = torch.constant.int 0 | |
%int9223372036854775807_1744 = torch.constant.int 9223372036854775807 | |
%int1_1745 = torch.constant.int 1 | |
%2359 = torch.aten.slice.Tensor %2358, %int1_1742, %int0_1743, %int9223372036854775807_1744, %int1_1745 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2359, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1746 = torch.constant.int 0 | |
%int0_1747 = torch.constant.int 0 | |
%int1_1748 = torch.constant.int 1 | |
%2360 = torch.aten.slice.Tensor %2357, %int0_1746, %int0_1747, %777, %int1_1748 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2360, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_1749 = torch.constant.int 1 | |
%int0_1750 = torch.constant.int 0 | |
%int9223372036854775807_1751 = torch.constant.int 9223372036854775807 | |
%int1_1752 = torch.constant.int 1 | |
%2361 = torch.aten.slice.Tensor %2360, %int1_1749, %int0_1750, %int9223372036854775807_1751, %int1_1752 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2361, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_1753 = torch.constant.int 0 | |
%2362 = torch.aten.unsqueeze %2359, %int0_1753 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2362, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1754 = torch.constant.int 1 | |
%int0_1755 = torch.constant.int 0 | |
%int9223372036854775807_1756 = torch.constant.int 9223372036854775807 | |
%int1_1757 = torch.constant.int 1 | |
%2363 = torch.aten.slice.Tensor %2362, %int1_1754, %int0_1755, %int9223372036854775807_1756, %int1_1757 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2363, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1758 = torch.constant.int 2 | |
%2364 = torch.aten.unsqueeze %2363, %int2_1758 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2364, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1759 = torch.constant.int 3 | |
%int0_1760 = torch.constant.int 0 | |
%int9223372036854775807_1761 = torch.constant.int 9223372036854775807 | |
%int1_1762 = torch.constant.int 1 | |
%2365 = torch.aten.slice.Tensor %2364, %int3_1759, %int0_1760, %int9223372036854775807_1761, %int1_1762 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2365, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1763 = torch.constant.int 4 | |
%int1_1764 = torch.constant.int 1 | |
%int1_1765 = torch.constant.int 1 | |
%int1_1766 = torch.constant.int 1 | |
%2366 = torch.prim.ListConstruct %int4_1763, %int1_1764, %int1_1765, %int1_1766 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2367 = torch.aten.repeat %2365, %2366 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2367, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_1767 = torch.constant.int 0 | |
%2368 = torch.aten.unsqueeze %2361, %int0_1767 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2368, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_1768 = torch.constant.int 1 | |
%int0_1769 = torch.constant.int 0 | |
%int9223372036854775807_1770 = torch.constant.int 9223372036854775807 | |
%int1_1771 = torch.constant.int 1 | |
%2369 = torch.aten.slice.Tensor %2368, %int1_1768, %int0_1769, %int9223372036854775807_1770, %int1_1771 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2369, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_1772 = torch.constant.int 2 | |
%2370 = torch.aten.unsqueeze %2369, %int2_1772 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2370, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_1773 = torch.constant.int 3 | |
%int0_1774 = torch.constant.int 0 | |
%int9223372036854775807_1775 = torch.constant.int 9223372036854775807 | |
%int1_1776 = torch.constant.int 1 | |
%2371 = torch.aten.slice.Tensor %2370, %int3_1773, %int0_1774, %int9223372036854775807_1775, %int1_1776 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2371, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_1777 = torch.constant.int 4 | |
%int1_1778 = torch.constant.int 1 | |
%int1_1779 = torch.constant.int 1 | |
%int1_1780 = torch.constant.int 1 | |
%2372 = torch.prim.ListConstruct %int4_1777, %int1_1778, %int1_1779, %int1_1780 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2373 = torch.aten.repeat %2371, %2372 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2373, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
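// Rotary embedding applied to the 8-head K tensor %2253: k*cos plus rotate_half(k)*sin,
// where rotate_half is the slice/negate/concat sequence producing %2379.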
%2374 = torch.aten.mul.Tensor %2253, %2367 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2374, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_1781 = torch.constant.int 3 | |
%int0_1782 = torch.constant.int 0 | |
%int64_1783 = torch.constant.int 64 | |
%int1_1784 = torch.constant.int 1 | |
%2375 = torch.aten.slice.Tensor %2253, %int3_1781, %int0_1782, %int64_1783, %int1_1784 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2375, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_1785 = torch.constant.int 3 | |
%int64_1786 = torch.constant.int 64 | |
%int9223372036854775807_1787 = torch.constant.int 9223372036854775807 | |
%int1_1788 = torch.constant.int 1 | |
%2376 = torch.aten.slice.Tensor %2253, %int3_1785, %int64_1786, %int9223372036854775807_1787, %int1_1788 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2376, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2377 = torch.aten.neg %2376 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2377, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2378 = torch.prim.ListConstruct %2377, %2375 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_1789 = torch.constant.int -1 | |
%2379 = torch.aten.cat %2378, %int-1_1789 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2379, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%2380 = torch.aten.mul.Tensor %2379, %2373 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2380, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_1790 = torch.constant.int 1 | |
%2381 = torch.aten.add.Tensor %2374, %2380, %int1_1790 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2381, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
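// %2382/%2383 appear to compute linearized slot indices into the paged KV cache: page ids in
// %arg2 are scaled by 64 (likely 32 transformer blocks x 2 for K/V) and offset to this block's
// K slot; the V slot reuses the same indices plus 1 (%2414 further down).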
%int64_1791 = torch.constant.int 64 | |
%2382 = torch.aten.mul.Scalar %arg2, %int64_1791 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2382, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int8_1792 = torch.constant.int 8 | |
%int1_1793 = torch.constant.int 1 | |
%2383 = torch.aten.add.Scalar %2382, %int8_1792, %int1_1793 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2383, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_1794 = torch.constant.int 4 | |
%int32_1795 = torch.constant.int 32 | |
%int8_1796 = torch.constant.int 8 | |
%int128_1797 = torch.constant.int 128 | |
%2384 = torch.prim.ListConstruct %int4_1794, %775, %int32_1795, %int8_1796, %int128_1797 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2385 = torch.aten.view %2381, %2384 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2385, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1798 = torch.constant.int 32 | |
%int8_1799 = torch.constant.int 8 | |
%int128_1800 = torch.constant.int 128 | |
%2386 = torch.prim.ListConstruct %997, %int32_1798, %int8_1799, %int128_1800 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2387 = torch.aten.view %2385, %2386 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2387, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2388 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2389 = torch.aten.view %2383, %2388 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2389, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
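// The rotated K states are scattered into the shared cache buffer %2088. Both the values and
// the cache are bitcast (aten.view.dtype) from f8E4M3FNUZ to si8 so index_put can operate on
// an integer view, then bitcast back and reshaped to the flat [?, 2097152] cache layout.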
%int26_1801 = torch.constant.int 26 | |
%2390 = torch.prims.convert_element_type %2387, %int26_1801 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2390, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1802 = torch.constant.int 1 | |
%2391 = torch.aten.view.dtype %2390, %int1_1802 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2391, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2392 = torch.aten.detach %2391 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2392, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2393 = torch.aten.detach %2392 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2393, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1803 = torch.constant.int 32 | |
%int2_1804 = torch.constant.int 2 | |
%int32_1805 = torch.constant.int 32 | |
%int8_1806 = torch.constant.int 8 | |
%int128_1807 = torch.constant.int 128 | |
%2394 = torch.prim.ListConstruct %776, %int32_1803, %int2_1804, %int32_1805, %int8_1806, %int128_1807 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2395 = torch.aten.view %2088, %2394 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2395, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1808 = torch.constant.int 32 | |
%int8_1809 = torch.constant.int 8 | |
%int128_1810 = torch.constant.int 128 | |
%2396 = torch.prim.ListConstruct %990, %int32_1808, %int8_1809, %int128_1810 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2397 = torch.aten.view %2395, %2396 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2397, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1811 = torch.constant.int 1 | |
%2398 = torch.aten.view.dtype %2397, %int1_1811 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2398, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2399 = torch.aten.detach %2398 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2399, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2400 = torch.aten.detach %2399 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2400, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2401 = torch.prim.ListConstruct %2389 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1812 = torch.constant.bool false | |
%2402 = torch.aten.index_put %2400, %2401, %2393, %false_1812 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2402, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1813 = torch.constant.int 26 | |
%2403 = torch.aten.view.dtype %2402, %int26_1813 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2403, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2404 = torch.aten.detach %2403 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2404, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2405 = torch.aten.detach %2404 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2405, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1814 = torch.constant.int 32 | |
%int2_1815 = torch.constant.int 2 | |
%int32_1816 = torch.constant.int 32 | |
%int8_1817 = torch.constant.int 8 | |
%int128_1818 = torch.constant.int 128 | |
%2406 = torch.prim.ListConstruct %776, %int32_1814, %int2_1815, %int32_1816, %int8_1817, %int128_1818 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2407 = torch.aten.view %2405, %2406 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2407, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1819 = torch.constant.int 2097152 | |
%2408 = torch.prim.ListConstruct %776, %int2097152_1819 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2409 = torch.aten.view %2407, %2408 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2409, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
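// The same scatter is repeated for the V states (%2255) at the neighbouring cache slot
// (indices %2414 = %2383 + 1), producing the updated flat cache %2436.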
%int4_1820 = torch.constant.int 4 | |
%int32_1821 = torch.constant.int 32 | |
%int8_1822 = torch.constant.int 8 | |
%int128_1823 = torch.constant.int 128 | |
%2410 = torch.prim.ListConstruct %int4_1820, %775, %int32_1821, %int8_1822, %int128_1823 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2411 = torch.aten.view %2255, %2410 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2411, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_1824 = torch.constant.int 32 | |
%int8_1825 = torch.constant.int 8 | |
%int128_1826 = torch.constant.int 128 | |
%2412 = torch.prim.ListConstruct %997, %int32_1824, %int8_1825, %int128_1826 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2413 = torch.aten.view %2411, %2412 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2413, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1827 = torch.constant.int 1 | |
%int1_1828 = torch.constant.int 1 | |
%2414 = torch.aten.add.Scalar %2383, %int1_1827, %int1_1828 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2414, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%2415 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2416 = torch.aten.view %2414, %2415 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2416, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_1829 = torch.constant.int 26 | |
%2417 = torch.prims.convert_element_type %2413, %int26_1829 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2417, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1830 = torch.constant.int 1 | |
%2418 = torch.aten.view.dtype %2417, %int1_1830 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2418, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2419 = torch.aten.detach %2418 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2419, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2420 = torch.aten.detach %2419 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2420, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_1831 = torch.constant.int 32 | |
%int2_1832 = torch.constant.int 2 | |
%int32_1833 = torch.constant.int 32 | |
%int8_1834 = torch.constant.int 8 | |
%int128_1835 = torch.constant.int 128 | |
%2421 = torch.prim.ListConstruct %776, %int32_1831, %int2_1832, %int32_1833, %int8_1834, %int128_1835 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2422 = torch.aten.view %2409, %2421 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2422, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_1836 = torch.constant.int 32 | |
%int8_1837 = torch.constant.int 8 | |
%int128_1838 = torch.constant.int 128 | |
%2423 = torch.prim.ListConstruct %990, %int32_1836, %int8_1837, %int128_1838 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2424 = torch.aten.view %2422, %2423 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2424, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_1839 = torch.constant.int 1 | |
%2425 = torch.aten.view.dtype %2424, %int1_1839 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2425, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2426 = torch.aten.detach %2425 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2426, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2427 = torch.aten.detach %2426 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2427, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2428 = torch.prim.ListConstruct %2416 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_1840 = torch.constant.bool false | |
%2429 = torch.aten.index_put %2427, %2428, %2420, %false_1840 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2429, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_1841 = torch.constant.int 26 | |
%2430 = torch.aten.view.dtype %2429, %int26_1841 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2430, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2431 = torch.aten.detach %2430 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2431, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2432 = torch.aten.detach %2431 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2432, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_1842 = torch.constant.int 32 | |
%int2_1843 = torch.constant.int 2 | |
%int32_1844 = torch.constant.int 32 | |
%int8_1845 = torch.constant.int 8 | |
%int128_1846 = torch.constant.int 128 | |
%2433 = torch.prim.ListConstruct %776, %int32_1842, %int2_1843, %int32_1844, %int8_1845, %int128_1846 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2434 = torch.aten.view %2432, %2433 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2434, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_1847 = torch.constant.int 2097152 | |
%2435 = torch.prim.ListConstruct %776, %int2097152_1847 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2436 = torch.aten.view %2434, %2435 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2436, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
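// Grouped-query attention head expansion: the 8 KV heads are unsqueezed, expanded by a factor
// of 4 and flattened to match the 32 query heads ([4, ?, 8, 128] -> [4, ?, 32, 128]) for both K and V.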
%int-2_1848 = torch.constant.int -2 | |
%2437 = torch.aten.unsqueeze %2381, %int-2_1848 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2437, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1849 = torch.constant.int 4 | |
%int8_1850 = torch.constant.int 8 | |
%int4_1851 = torch.constant.int 4 | |
%int128_1852 = torch.constant.int 128 | |
%2438 = torch.prim.ListConstruct %int4_1849, %777, %int8_1850, %int4_1851, %int128_1852 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1853 = torch.constant.bool false | |
%2439 = torch.aten.expand %2437, %2438, %false_1853 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2439, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1854 = torch.constant.int 0 | |
%2440 = torch.aten.clone %2439, %int0_1854 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2440, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1855 = torch.constant.int 4 | |
%int32_1856 = torch.constant.int 32 | |
%int128_1857 = torch.constant.int 128 | |
%2441 = torch.prim.ListConstruct %int4_1855, %777, %int32_1856, %int128_1857 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2442 = torch.aten._unsafe_view %2440, %2441 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2442, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_1858 = torch.constant.int -2 | |
%2443 = torch.aten.unsqueeze %2255, %int-2_1858 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2443, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_1859 = torch.constant.int 4 | |
%int8_1860 = torch.constant.int 8 | |
%int4_1861 = torch.constant.int 4 | |
%int128_1862 = torch.constant.int 128 | |
%2444 = torch.prim.ListConstruct %int4_1859, %777, %int8_1860, %int4_1861, %int128_1862 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1863 = torch.constant.bool false | |
%2445 = torch.aten.expand %2443, %2444, %false_1863 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2445, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_1864 = torch.constant.int 0 | |
%2446 = torch.aten.clone %2445, %int0_1864 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2446, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_1865 = torch.constant.int 4 | |
%int32_1866 = torch.constant.int 32 | |
%int128_1867 = torch.constant.int 128 | |
%2447 = torch.prim.ListConstruct %int4_1865, %777, %int32_1866, %int128_1867 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2448 = torch.aten._unsafe_view %2446, %2447 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2448, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
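// Q (%2318), expanded K (%2442) and expanded V (%2448) are transposed to
// [batch, heads, seq, head_dim] and cast to f8E4M3FNUZ (torch dtype 26) for the fused kernel.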
%int1_1868 = torch.constant.int 1 | |
%int2_1869 = torch.constant.int 2 | |
%2449 = torch.aten.transpose.int %2318, %int1_1868, %int2_1869 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2449, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1870 = torch.constant.int 1 | |
%int2_1871 = torch.constant.int 2 | |
%2450 = torch.aten.transpose.int %2442, %int1_1870, %int2_1871 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2450, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_1872 = torch.constant.int 1 | |
%int2_1873 = torch.constant.int 2 | |
%2451 = torch.aten.transpose.int %2448, %int1_1872, %int2_1873 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2451, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1874 = torch.constant.int 26 | |
%2452 = torch.prims.convert_element_type %2449, %int26_1874 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2452, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1875 = torch.constant.int 26 | |
%2453 = torch.prims.convert_element_type %2450, %int26_1875 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2453, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_1876 = torch.constant.int 26 | |
%2454 = torch.prims.convert_element_type %2451, %int26_1876 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2454, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
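// The additive attention mask %803 is cast to f8E4M3FNUZ and reduced to a 2-D [seq, seq]
// slice, presumably because the mask is identical across the batch and head dimensions.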
%int26_1877 = torch.constant.int 26 | |
%2455 = torch.prims.convert_element_type %803, %int26_1877 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2455, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_1878 = torch.constant.int 0 | |
%int0_1879 = torch.constant.int 0 | |
%2456 = torch.aten.select.int %2455, %int0_1878, %int0_1879 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2456, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_1880 = torch.constant.int 0 | |
%int0_1881 = torch.constant.int 0 | |
%2457 = torch.aten.select.int %2456, %int0_1880, %int0_1881 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2457, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_1882 = torch.constant.int 0 | |
%int0_1883 = torch.constant.int 0 | |
%int9223372036854775807_1884 = torch.constant.int 9223372036854775807 | |
%int1_1885 = torch.constant.int 1 | |
%2458 = torch.aten.slice.Tensor %2457, %int0_1882, %int0_1883, %int9223372036854775807_1884, %int1_1885 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2458, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_1886 = torch.constant.int 1 | |
%int0_1887 = torch.constant.int 0 | |
%int9223372036854775807_1888 = torch.constant.int 9223372036854775807 | |
%int1_1889 = torch.constant.int 1 | |
%2459 = torch.aten.slice.Tensor %2458, %int1_1886, %int0_1887, %int9223372036854775807_1888, %int1_1889 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2459, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
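// Attention itself is delegated to an external kernel: %2469 calls
// sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32 with Q, K, V,
// the scalar scale %2468 (cloned from %107) and the 2-D mask, returning f32 of shape [4, 32, ?, 128].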
%none_1890 = torch.constant.none | |
%2460 = torch.aten.clone %107, %none_1890 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%2461 = torch.aten.detach %2460 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2462 = torch.aten.detach %2461 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2463 = torch.aten.detach %2462 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2464 = torch_c.to_builtin_tensor %2452 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2465 = torch_c.to_builtin_tensor %2453 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2466 = torch_c.to_builtin_tensor %2454 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2467 = torch_c.to_builtin_tensor %2459 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%2468 = torch_c.to_builtin_tensor %2463 : !torch.vtensor<[],f32> -> tensor<f32> | |
%2469 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%2464, %2465, %2466, %2468, %2467) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%2470 = torch_c.from_builtin_tensor %2469 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %2470, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
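// Post-attention: the result is transposed back to [4, ?, 32, 128], flattened to [4, ?, 4096],
// then re-quantized for the output projection (divide by the input rscale %108, clamp to the
// f8E4M3FNUZ range of +-240, convert to dtype 26). The batched matmul against the expanded
// attn_output weight %109 follows, with the dequant scale %108 * %110 applied before the
// residual add into %2490.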
%int1_1891 = torch.constant.int 1 | |
%int2_1892 = torch.constant.int 2 | |
%2471 = torch.aten.transpose.int %2470, %int1_1891, %int2_1892 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2471, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_1893 = torch.constant.int 0 | |
%2472 = torch.aten.clone %2471, %int0_1893 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2472, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_1894 = torch.constant.int 4 | |
%int4096_1895 = torch.constant.int 4096 | |
%2473 = torch.prim.ListConstruct %int4_1894, %777, %int4096_1895 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2474 = torch.aten._unsafe_view %2472, %2473 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2474, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2475 = torch.aten.div.Tensor %2474, %108 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2475, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1896 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1897 = torch.constant.float 2.400000e+02 | |
%2476 = torch.aten.clamp %2475, %float-2.400000e02_1896, %float2.400000e02_1897 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2476, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1898 = torch.constant.int 26 | |
%2477 = torch.prims.convert_element_type %2476, %int26_1898 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2477, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1899 = torch.constant.int 0 | |
%2478 = torch.aten.unsqueeze %109, %int0_1899 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1900 = torch.constant.int 4 | |
%int4096_1901 = torch.constant.int 4096 | |
%int4096_1902 = torch.constant.int 4096 | |
%2479 = torch.prim.ListConstruct %int4_1900, %int4096_1901, %int4096_1902 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1903 = torch.constant.bool false | |
%2480 = torch.aten.expand %2478, %2479, %false_1903 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2481 = torch_c.to_builtin_tensor %2477 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2482 = torch_c.to_builtin_tensor %2480 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2483 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2481, %2482) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2484 = torch_c.from_builtin_tensor %2483 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2484, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2485 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2486 = torch.aten.permute %110, %2485 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2487 = torch.aten.mul.Tensor %108, %2486 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1904 = torch.constant.int 6 | |
%2488 = torch.prims.convert_element_type %2484, %int6_1904 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2488, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2489 = torch.aten.mul.Tensor %2488, %2487 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2489, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1905 = torch.constant.int 1 | |
%2490 = torch.aten.add.Tensor %2200, %2489, %int1_1905 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2490, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
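// FFN pre-norm: RMSNorm of the residual stream (square, mean over the last dim, +1e-05,
// rsqrt, scale by the ffn_norm weight %111), followed by fp8 quantization with rscale %112
// and the usual +-240 clamp.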
%int6_1906 = torch.constant.int 6 | |
%2491 = torch.prims.convert_element_type %2490, %int6_1906 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2491, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1907 = torch.constant.int 2 | |
%2492 = torch.aten.pow.Tensor_Scalar %2491, %int2_1907 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2492, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1908 = torch.constant.int -1 | |
%2493 = torch.prim.ListConstruct %int-1_1908 : (!torch.int) -> !torch.list<int> | |
%true_1909 = torch.constant.bool true | |
%none_1910 = torch.constant.none | |
%2494 = torch.aten.mean.dim %2492, %2493, %true_1909, %none_1910 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2494, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1911 = torch.constant.float 1.000000e-05 | |
%int1_1912 = torch.constant.int 1 | |
%2495 = torch.aten.add.Scalar %2494, %float1.000000e-05_1911, %int1_1912 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2495, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2496 = torch.aten.rsqrt %2495 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2496, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2497 = torch.aten.mul.Tensor %2491, %2496 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2497, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1913 = torch.constant.int 6 | |
%2498 = torch.prims.convert_element_type %2497, %int6_1913 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2498, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2499 = torch.aten.mul.Tensor %111, %2498 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2499, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1914 = torch.constant.int 6 | |
%2500 = torch.prims.convert_element_type %2499, %int6_1914 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2500, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2501 = torch.aten.div.Tensor %2500, %112 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2501, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1915 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1916 = torch.constant.float 2.400000e+02 | |
%2502 = torch.aten.clamp %2501, %float-2.400000e02_1915, %float2.400000e02_1916 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2502, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1917 = torch.constant.int 26 | |
%2503 = torch.prims.convert_element_type %2502, %int26_1917 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2503, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
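// ffn_gate projection: batched matmul against the expanded [4, 14336, 4096] gate weight %113,
// dequantized by %112 * %114, then SiLU activation (%2516).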
%int0_1918 = torch.constant.int 0 | |
%2504 = torch.aten.unsqueeze %113, %int0_1918 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1919 = torch.constant.int 4 | |
%int14336_1920 = torch.constant.int 14336 | |
%int4096_1921 = torch.constant.int 4096 | |
%2505 = torch.prim.ListConstruct %int4_1919, %int14336_1920, %int4096_1921 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1922 = torch.constant.bool false | |
%2506 = torch.aten.expand %2504, %2505, %false_1922 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2507 = torch_c.to_builtin_tensor %2503 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2508 = torch_c.to_builtin_tensor %2506 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2509 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2507, %2508) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2510 = torch_c.from_builtin_tensor %2509 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2510, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2511 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2512 = torch.aten.permute %114, %2511 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2513 = torch.aten.mul.Tensor %112, %2512 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1923 = torch.constant.int 6 | |
%2514 = torch.prims.convert_element_type %2510, %int6_1923 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2514, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2515 = torch.aten.mul.Tensor %2514, %2513 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2515, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2516 = torch.aten.silu %2515 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2516, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
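// ffn_up projection: the normalized activations are re-quantized with rscale %115, multiplied
// against the up weight %116, dequantized by %115 * %117, and gated by multiplying with the
// SiLU output (%2532).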
%2517 = torch.aten.div.Tensor %2500, %115 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2517, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1924 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1925 = torch.constant.float 2.400000e+02 | |
%2518 = torch.aten.clamp %2517, %float-2.400000e02_1924, %float2.400000e02_1925 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2518, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1926 = torch.constant.int 26 | |
%2519 = torch.prims.convert_element_type %2518, %int26_1926 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2519, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1927 = torch.constant.int 0 | |
%2520 = torch.aten.unsqueeze %116, %int0_1927 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_1928 = torch.constant.int 4 | |
%int14336_1929 = torch.constant.int 14336 | |
%int4096_1930 = torch.constant.int 4096 | |
%2521 = torch.prim.ListConstruct %int4_1928, %int14336_1929, %int4096_1930 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1931 = torch.constant.bool false | |
%2522 = torch.aten.expand %2520, %2521, %false_1931 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2523 = torch_c.to_builtin_tensor %2519 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2524 = torch_c.to_builtin_tensor %2522 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2525 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2523, %2524) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2526 = torch_c.from_builtin_tensor %2525 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2526, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2527 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2528 = torch.aten.permute %117, %2527 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2529 = torch.aten.mul.Tensor %115, %2528 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1932 = torch.constant.int 6 | |
%2530 = torch.prims.convert_element_type %2526, %int6_1932 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2530, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2531 = torch.aten.mul.Tensor %2530, %2529 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2531, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2532 = torch.aten.mul.Tensor %2516, %2531 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2532, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
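// ffn_down projection: the gated activations are quantized with rscale %118, multiplied
// against the [4, 4096, 14336] down weight %119, dequantized by %118 * %120, and added back
// into the residual stream (%2548).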
%2533 = torch.aten.div.Tensor %2532, %118 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2533, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_1933 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1934 = torch.constant.float 2.400000e+02 | |
%2534 = torch.aten.clamp %2533, %float-2.400000e02_1933, %float2.400000e02_1934 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2534, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_1935 = torch.constant.int 26 | |
%2535 = torch.prims.convert_element_type %2534, %int26_1935 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2535, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_1936 = torch.constant.int 0 | |
%2536 = torch.aten.unsqueeze %119, %int0_1936 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_1937 = torch.constant.int 4 | |
%int4096_1938 = torch.constant.int 4096 | |
%int14336_1939 = torch.constant.int 14336 | |
%2537 = torch.prim.ListConstruct %int4_1937, %int4096_1938, %int14336_1939 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1940 = torch.constant.bool false | |
%2538 = torch.aten.expand %2536, %2537, %false_1940 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%2539 = torch_c.to_builtin_tensor %2535 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%2540 = torch_c.to_builtin_tensor %2538 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%2541 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%2539, %2540) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2542 = torch_c.from_builtin_tensor %2541 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2542, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2543 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2544 = torch.aten.permute %120, %2543 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2545 = torch.aten.mul.Tensor %118, %2544 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_1941 = torch.constant.int 6 | |
%2546 = torch.prims.convert_element_type %2542, %int6_1941 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2546, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2547 = torch.aten.mul.Tensor %2546, %2545 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2547, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_1942 = torch.constant.int 1 | |
%2548 = torch.aten.add.Tensor %2490, %2547, %int1_1942 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2548, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
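// What follows looks like the next block's input RMSNorm (weight %121) and fp8 quantization
// (rscale %122), feeding its attention Q projection against the 4096x4096 weight %123.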
%int6_1943 = torch.constant.int 6 | |
%2549 = torch.prims.convert_element_type %2548, %int6_1943 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2549, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_1944 = torch.constant.int 2 | |
%2550 = torch.aten.pow.Tensor_Scalar %2549, %int2_1944 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2550, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_1945 = torch.constant.int -1 | |
%2551 = torch.prim.ListConstruct %int-1_1945 : (!torch.int) -> !torch.list<int> | |
%true_1946 = torch.constant.bool true | |
%none_1947 = torch.constant.none | |
%2552 = torch.aten.mean.dim %2550, %2551, %true_1946, %none_1947 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2552, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_1948 = torch.constant.float 1.000000e-05 | |
%int1_1949 = torch.constant.int 1 | |
%2553 = torch.aten.add.Scalar %2552, %float1.000000e-05_1948, %int1_1949 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2553, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2554 = torch.aten.rsqrt %2553 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2554, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2555 = torch.aten.mul.Tensor %2549, %2554 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2555, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1950 = torch.constant.int 6 | |
%2556 = torch.prims.convert_element_type %2555, %int6_1950 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2556, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2557 = torch.aten.mul.Tensor %121, %2556 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2557, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_1951 = torch.constant.int 6 | |
%2558 = torch.prims.convert_element_type %2557, %int6_1951 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2558, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
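// Q/K/V projections on the normalized activations %2558. Each input is quantized to f8E4M3FNUZ by
// dividing by its per-tensor input scale and clamping to [-240, 240] (the finite range of f8E4M3FNUZ),
// then fed to the batched transpose-B matmul kernel against the expanded f8 weight; each f32 result is
// requantized to f8 with the corresponding output scale.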
%2559 = torch.aten.div.Tensor %2558, %122 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2559, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1952 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1953 = torch.constant.float 2.400000e+02 | |
%2560 = torch.aten.clamp %2559, %float-2.400000e02_1952, %float2.400000e02_1953 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2560, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1954 = torch.constant.int 26 | |
%2561 = torch.prims.convert_element_type %2560, %int26_1954 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2561, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1955 = torch.constant.int 0 | |
%2562 = torch.aten.unsqueeze %123, %int0_1955 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_1956 = torch.constant.int 4 | |
%int4096_1957 = torch.constant.int 4096 | |
%int4096_1958 = torch.constant.int 4096 | |
%2563 = torch.prim.ListConstruct %int4_1956, %int4096_1957, %int4096_1958 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1959 = torch.constant.bool false | |
%2564 = torch.aten.expand %2562, %2563, %false_1959 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2565 = torch_c.to_builtin_tensor %2561 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2566 = torch_c.to_builtin_tensor %2564 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2567 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2565, %2566) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2568 = torch_c.from_builtin_tensor %2567 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2568, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2569 = torch.aten.div.Tensor %2568, %124 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2569, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1960 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1961 = torch.constant.float 2.400000e+02 | |
%2570 = torch.aten.clamp %2569, %float-2.400000e02_1960, %float2.400000e02_1961 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2570, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1962 = torch.constant.int 26 | |
%2571 = torch.prims.convert_element_type %2570, %int26_1962 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2571, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%2572 = torch.aten.div.Tensor %2558, %125 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2572, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1963 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1964 = torch.constant.float 2.400000e+02 | |
%2573 = torch.aten.clamp %2572, %float-2.400000e02_1963, %float2.400000e02_1964 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2573, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1965 = torch.constant.int 26 | |
%2574 = torch.prims.convert_element_type %2573, %int26_1965 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2574, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1966 = torch.constant.int 0 | |
%2575 = torch.aten.unsqueeze %126, %int0_1966 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1967 = torch.constant.int 4 | |
%int1024_1968 = torch.constant.int 1024 | |
%int4096_1969 = torch.constant.int 4096 | |
%2576 = torch.prim.ListConstruct %int4_1967, %int1024_1968, %int4096_1969 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1970 = torch.constant.bool false | |
%2577 = torch.aten.expand %2575, %2576, %false_1970 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2578 = torch_c.to_builtin_tensor %2574 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2579 = torch_c.to_builtin_tensor %2577 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2580 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2578, %2579) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2581 = torch_c.from_builtin_tensor %2580 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2581, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2582 = torch.aten.div.Tensor %2581, %127 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2582, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1971 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1972 = torch.constant.float 2.400000e+02 | |
%2583 = torch.aten.clamp %2582, %float-2.400000e02_1971, %float2.400000e02_1972 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2583, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1973 = torch.constant.int 26 | |
%2584 = torch.prims.convert_element_type %2583, %int26_1973 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2584, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%2585 = torch.aten.div.Tensor %2558, %128 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2585, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_1974 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1975 = torch.constant.float 2.400000e+02 | |
%2586 = torch.aten.clamp %2585, %float-2.400000e02_1974, %float2.400000e02_1975 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2586, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_1976 = torch.constant.int 26 | |
%2587 = torch.prims.convert_element_type %2586, %int26_1976 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2587, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_1977 = torch.constant.int 0 | |
%2588 = torch.aten.unsqueeze %129, %int0_1977 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_1978 = torch.constant.int 4 | |
%int1024_1979 = torch.constant.int 1024 | |
%int4096_1980 = torch.constant.int 4096 | |
%2589 = torch.prim.ListConstruct %int4_1978, %int1024_1979, %int4096_1980 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_1981 = torch.constant.bool false | |
%2590 = torch.aten.expand %2588, %2589, %false_1981 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2591 = torch_c.to_builtin_tensor %2587 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2592 = torch_c.to_builtin_tensor %2590 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2593 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2591, %2592) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2594 = torch_c.from_builtin_tensor %2593 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2594, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2595 = torch.aten.div.Tensor %2594, %130 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2595, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_1982 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_1983 = torch.constant.float 2.400000e+02 | |
%2596 = torch.aten.clamp %2595, %float-2.400000e02_1982, %float2.400000e02_1983 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2596, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_1984 = torch.constant.int 26 | |
%2597 = torch.prims.convert_element_type %2596, %int26_1984 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2597, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
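// Reshape the projections to per-head layout: Q -> [4, seq, 32, 128], K and V -> [4, seq, 8, 128]
// (32 query heads, 8 KV heads, head dim 128).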
%int4_1985 = torch.constant.int 4 | |
%int32_1986 = torch.constant.int 32 | |
%int128_1987 = torch.constant.int 128 | |
%2598 = torch.prim.ListConstruct %int4_1985, %777, %int32_1986, %int128_1987 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2599 = torch.aten.view %2571, %2598 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2599, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_1988 = torch.constant.int 4 | |
%int8_1989 = torch.constant.int 8 | |
%int128_1990 = torch.constant.int 128 | |
%2600 = torch.prim.ListConstruct %int4_1988, %777, %int8_1989, %int128_1990 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2601 = torch.aten.view %2584, %2600 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2601, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_1991 = torch.constant.int 4 | |
%int8_1992 = torch.constant.int 8 | |
%int128_1993 = torch.constant.int 128 | |
%2602 = torch.prim.ListConstruct %int4_1991, %777, %int8_1992, %int128_1993 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2603 = torch.aten.view %2597, %2602 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2603, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
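// Build the rotary embedding tables: positions arange(131072) and inverse frequencies from base 5.0e+05
// over 64 even indices, with what appears to be Llama-3-style frequency scaling (components whose
// wavelength exceeds 8192 have their frequency divided by 8, those below 2048 are left unchanged, and
// the band in between is smoothly interpolated), followed by the outer product with the positions and
// cos/sin tables in bf16.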
%int131072_1994 = torch.constant.int 131072 | |
%none_1995 = torch.constant.none | |
%none_1996 = torch.constant.none | |
%cpu_1997 = torch.constant.device "cpu" | |
%false_1998 = torch.constant.bool false | |
%2604 = torch.aten.arange %int131072_1994, %none_1995, %none_1996, %cpu_1997, %false_1998 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_1999 = torch.constant.int 0 | |
%int128_2000 = torch.constant.int 128 | |
%int2_2001 = torch.constant.int 2 | |
%int4_2002 = torch.constant.int 4 | |
%none_2003 = torch.constant.none | |
%cpu_2004 = torch.constant.device "cpu" | |
%false_2005 = torch.constant.bool false | |
%2605 = torch.aten.arange.start_step %int0_1999, %int128_2000, %int2_2001, %int4_2002, %none_2003, %cpu_2004, %false_2005 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2006 = torch.constant.int 6 | |
%2606 = torch.prims.convert_element_type %2605, %int6_2006 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2007 = torch.constant.int 128 | |
%2607 = torch.aten.div.Scalar %2606, %int128_2007 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2008 = torch.constant.float 5.000000e+05 | |
%2608 = torch.aten.pow.Scalar %float5.000000e05_2008, %2607 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2609 = torch.aten.reciprocal %2608 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2009 = torch.constant.float 1.000000e+00 | |
%2610 = torch.aten.mul.Scalar %2609, %float1.000000e00_2009 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2611 = torch.aten.reciprocal %2610 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2010 = torch.constant.float 6.2831853071795862 | |
%2612 = torch.aten.mul.Scalar %2611, %float6.283190e00_2010 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2011 = torch.constant.float 8.192000e+03 | |
%2613 = torch.aten.gt.Scalar %2612, %float8.192000e03_2011 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2012 = torch.constant.int 8 | |
%2614 = torch.aten.div.Scalar %2610, %int8_2012 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2615 = torch.aten.where.self %2613, %2614, %2610 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2616 = torch.aten.reciprocal %2612 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2013 = torch.constant.int 8192 | |
%2617 = torch.aten.mul.Scalar %2616, %int8192_2013 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2014 = torch.constant.int 1 | |
%int1_2015 = torch.constant.int 1 | |
%2618 = torch.aten.sub.Scalar %2617, %int1_2014, %int1_2015 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2016 = torch.constant.int 3 | |
%2619 = torch.aten.div.Scalar %2618, %int3_2016 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2017 = torch.constant.int 1 | |
%int1_2018 = torch.constant.int 1 | |
%2620 = torch.aten.rsub.Scalar %2619, %int1_2017, %int1_2018 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2621 = torch.aten.mul.Tensor %2620, %2615 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2019 = torch.constant.int 8 | |
%2622 = torch.aten.div.Scalar %2621, %int8_2019 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2623 = torch.aten.mul.Tensor %2619, %2615 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2020 = torch.constant.int 1 | |
%2624 = torch.aten.add.Tensor %2622, %2623, %int1_2020 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2021 = torch.constant.float 2.048000e+03 | |
%2625 = torch.aten.lt.Scalar %2612, %float2.048000e03_2021 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2626 = torch.aten.bitwise_not %2625 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2022 = torch.constant.float 8.192000e+03 | |
%2627 = torch.aten.gt.Scalar %2612, %float8.192000e03_2022 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2628 = torch.aten.bitwise_not %2627 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2629 = torch.aten.mul.Tensor %2626, %2628 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2630 = torch.aten.where.self %2629, %2624, %2615 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2631 = torch.prim.ListConstruct %2630, %2630 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2023 = torch.constant.int -1 | |
%2632 = torch.aten.cat %2631, %int-1_2023 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2024 = torch.constant.int 6 | |
%2633 = torch.prims.convert_element_type %2632, %int6_2024 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2025 = torch.constant.int 1 | |
%2634 = torch.aten.unsqueeze %2604, %int1_2025 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2026 = torch.constant.int 6 | |
%2635 = torch.prims.convert_element_type %2634, %int6_2026 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2027 = torch.constant.int 0 | |
%2636 = torch.aten.unsqueeze %2633, %int0_2027 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2028 = torch.constant.int 6 | |
%2637 = torch.prims.convert_element_type %2636, %int6_2028 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2638 = torch.aten.mul.Tensor %2635, %2637 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2639 = torch.aten.cos %2638 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2029 = torch.constant.int 15 | |
%2640 = torch.prims.convert_element_type %2639, %int15_2029 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2641 = torch.aten.sin %2638 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2030 = torch.constant.int 15 | |
%2642 = torch.prims.convert_element_type %2641, %int15_2030 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_2031 = torch.constant.int 0 | |
%int0_2032 = torch.constant.int 0 | |
%int1_2033 = torch.constant.int 1 | |
%2643 = torch.aten.slice.Tensor %2640, %int0_2031, %int0_2032, %777, %int1_2033 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2643, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2034 = torch.constant.int 1 | |
%int0_2035 = torch.constant.int 0 | |
%int9223372036854775807_2036 = torch.constant.int 9223372036854775807 | |
%int1_2037 = torch.constant.int 1 | |
%2644 = torch.aten.slice.Tensor %2643, %int1_2034, %int0_2035, %int9223372036854775807_2036, %int1_2037 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2644, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2038 = torch.constant.int 0 | |
%int0_2039 = torch.constant.int 0 | |
%int1_2040 = torch.constant.int 1 | |
%2645 = torch.aten.slice.Tensor %2642, %int0_2038, %int0_2039, %777, %int1_2040 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2645, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2041 = torch.constant.int 1 | |
%int0_2042 = torch.constant.int 0 | |
%int9223372036854775807_2043 = torch.constant.int 9223372036854775807 | |
%int1_2044 = torch.constant.int 1 | |
%2646 = torch.aten.slice.Tensor %2645, %int1_2041, %int0_2042, %int9223372036854775807_2043, %int1_2044 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2646, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
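// Slice the cos/sin tables to the current sequence length and broadcast them to [4, seq, 1, 128] so they
// can be applied across all heads.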
%int0_2045 = torch.constant.int 0 | |
%2647 = torch.aten.unsqueeze %2644, %int0_2045 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2647, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2046 = torch.constant.int 1 | |
%int0_2047 = torch.constant.int 0 | |
%int9223372036854775807_2048 = torch.constant.int 9223372036854775807 | |
%int1_2049 = torch.constant.int 1 | |
%2648 = torch.aten.slice.Tensor %2647, %int1_2046, %int0_2047, %int9223372036854775807_2048, %int1_2049 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2648, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2050 = torch.constant.int 2 | |
%2649 = torch.aten.unsqueeze %2648, %int2_2050 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2649, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2051 = torch.constant.int 3 | |
%int0_2052 = torch.constant.int 0 | |
%int9223372036854775807_2053 = torch.constant.int 9223372036854775807 | |
%int1_2054 = torch.constant.int 1 | |
%2650 = torch.aten.slice.Tensor %2649, %int3_2051, %int0_2052, %int9223372036854775807_2053, %int1_2054 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2650, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2055 = torch.constant.int 4 | |
%int1_2056 = torch.constant.int 1 | |
%int1_2057 = torch.constant.int 1 | |
%int1_2058 = torch.constant.int 1 | |
%2651 = torch.prim.ListConstruct %int4_2055, %int1_2056, %int1_2057, %int1_2058 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2652 = torch.aten.repeat %2650, %2651 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2652, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2059 = torch.constant.int 0 | |
%2653 = torch.aten.unsqueeze %2646, %int0_2059 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2653, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2060 = torch.constant.int 1 | |
%int0_2061 = torch.constant.int 0 | |
%int9223372036854775807_2062 = torch.constant.int 9223372036854775807 | |
%int1_2063 = torch.constant.int 1 | |
%2654 = torch.aten.slice.Tensor %2653, %int1_2060, %int0_2061, %int9223372036854775807_2062, %int1_2063 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2654, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2064 = torch.constant.int 2 | |
%2655 = torch.aten.unsqueeze %2654, %int2_2064 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2655, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2065 = torch.constant.int 3 | |
%int0_2066 = torch.constant.int 0 | |
%int9223372036854775807_2067 = torch.constant.int 9223372036854775807 | |
%int1_2068 = torch.constant.int 1 | |
%2656 = torch.aten.slice.Tensor %2655, %int3_2065, %int0_2066, %int9223372036854775807_2067, %int1_2068 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2656, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2069 = torch.constant.int 4 | |
%int1_2070 = torch.constant.int 1 | |
%int1_2071 = torch.constant.int 1 | |
%int1_2072 = torch.constant.int 1 | |
%2657 = torch.prim.ListConstruct %int4_2069, %int1_2070, %int1_2071, %int1_2072 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2658 = torch.aten.repeat %2656, %2657 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2658, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
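// Apply RoPE to Q: q * cos + rotate_half(q) * sin, where rotate_half concatenates the negated upper
// 64 lanes with the lower 64 lanes of each head.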
%2659 = torch.aten.mul.Tensor %2599, %2652 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2659, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_2073 = torch.constant.int 3 | |
%int0_2074 = torch.constant.int 0 | |
%int64_2075 = torch.constant.int 64 | |
%int1_2076 = torch.constant.int 1 | |
%2660 = torch.aten.slice.Tensor %2599, %int3_2073, %int0_2074, %int64_2075, %int1_2076 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2660, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_2077 = torch.constant.int 3 | |
%int64_2078 = torch.constant.int 64 | |
%int9223372036854775807_2079 = torch.constant.int 9223372036854775807 | |
%int1_2080 = torch.constant.int 1 | |
%2661 = torch.aten.slice.Tensor %2599, %int3_2077, %int64_2078, %int9223372036854775807_2079, %int1_2080 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2661, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2662 = torch.aten.neg %2661 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2662, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%2663 = torch.prim.ListConstruct %2662, %2660 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2081 = torch.constant.int -1 | |
%2664 = torch.aten.cat %2663, %int-1_2081 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2664, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%2665 = torch.aten.mul.Tensor %2664, %2658 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2665, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_2082 = torch.constant.int 1 | |
%2666 = torch.aten.add.Tensor %2659, %2665, %int1_2082 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2666, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
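// The same position/frequency tables are rebuilt from scratch for K (the trace does not reuse %2640/%2642),
// then sliced, broadcast, and applied to the 8 KV heads in the same way as for Q.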
%int131072_2083 = torch.constant.int 131072 | |
%none_2084 = torch.constant.none | |
%none_2085 = torch.constant.none | |
%cpu_2086 = torch.constant.device "cpu" | |
%false_2087 = torch.constant.bool false | |
%2667 = torch.aten.arange %int131072_2083, %none_2084, %none_2085, %cpu_2086, %false_2087 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2088 = torch.constant.int 0 | |
%int128_2089 = torch.constant.int 128 | |
%int2_2090 = torch.constant.int 2 | |
%int4_2091 = torch.constant.int 4 | |
%none_2092 = torch.constant.none | |
%cpu_2093 = torch.constant.device "cpu" | |
%false_2094 = torch.constant.bool false | |
%2668 = torch.aten.arange.start_step %int0_2088, %int128_2089, %int2_2090, %int4_2091, %none_2092, %cpu_2093, %false_2094 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2095 = torch.constant.int 6 | |
%2669 = torch.prims.convert_element_type %2668, %int6_2095 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2096 = torch.constant.int 128 | |
%2670 = torch.aten.div.Scalar %2669, %int128_2096 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2097 = torch.constant.float 5.000000e+05 | |
%2671 = torch.aten.pow.Scalar %float5.000000e05_2097, %2670 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2672 = torch.aten.reciprocal %2671 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2098 = torch.constant.float 1.000000e+00 | |
%2673 = torch.aten.mul.Scalar %2672, %float1.000000e00_2098 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2674 = torch.aten.reciprocal %2673 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2099 = torch.constant.float 6.2831853071795862 | |
%2675 = torch.aten.mul.Scalar %2674, %float6.283190e00_2099 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2100 = torch.constant.float 8.192000e+03 | |
%2676 = torch.aten.gt.Scalar %2675, %float8.192000e03_2100 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2101 = torch.constant.int 8 | |
%2677 = torch.aten.div.Scalar %2673, %int8_2101 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2678 = torch.aten.where.self %2676, %2677, %2673 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2679 = torch.aten.reciprocal %2675 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2102 = torch.constant.int 8192 | |
%2680 = torch.aten.mul.Scalar %2679, %int8192_2102 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2103 = torch.constant.int 1 | |
%int1_2104 = torch.constant.int 1 | |
%2681 = torch.aten.sub.Scalar %2680, %int1_2103, %int1_2104 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2105 = torch.constant.int 3 | |
%2682 = torch.aten.div.Scalar %2681, %int3_2105 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2106 = torch.constant.int 1 | |
%int1_2107 = torch.constant.int 1 | |
%2683 = torch.aten.rsub.Scalar %2682, %int1_2106, %int1_2107 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2684 = torch.aten.mul.Tensor %2683, %2678 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2108 = torch.constant.int 8 | |
%2685 = torch.aten.div.Scalar %2684, %int8_2108 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2686 = torch.aten.mul.Tensor %2682, %2678 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2109 = torch.constant.int 1 | |
%2687 = torch.aten.add.Tensor %2685, %2686, %int1_2109 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2110 = torch.constant.float 2.048000e+03 | |
%2688 = torch.aten.lt.Scalar %2675, %float2.048000e03_2110 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2689 = torch.aten.bitwise_not %2688 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2111 = torch.constant.float 8.192000e+03 | |
%2690 = torch.aten.gt.Scalar %2675, %float8.192000e03_2111 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2691 = torch.aten.bitwise_not %2690 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2692 = torch.aten.mul.Tensor %2689, %2691 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2693 = torch.aten.where.self %2692, %2687, %2678 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2694 = torch.prim.ListConstruct %2693, %2693 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2112 = torch.constant.int -1 | |
%2695 = torch.aten.cat %2694, %int-1_2112 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2113 = torch.constant.int 6 | |
%2696 = torch.prims.convert_element_type %2695, %int6_2113 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2114 = torch.constant.int 1 | |
%2697 = torch.aten.unsqueeze %2667, %int1_2114 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2115 = torch.constant.int 6 | |
%2698 = torch.prims.convert_element_type %2697, %int6_2115 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2116 = torch.constant.int 0 | |
%2699 = torch.aten.unsqueeze %2696, %int0_2116 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2117 = torch.constant.int 6 | |
%2700 = torch.prims.convert_element_type %2699, %int6_2117 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2701 = torch.aten.mul.Tensor %2698, %2700 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2702 = torch.aten.cos %2701 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2118 = torch.constant.int 15 | |
%2703 = torch.prims.convert_element_type %2702, %int15_2118 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2704 = torch.aten.sin %2701 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2119 = torch.constant.int 15 | |
%2705 = torch.prims.convert_element_type %2704, %int15_2119 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_2120 = torch.constant.int 0 | |
%int0_2121 = torch.constant.int 0 | |
%int1_2122 = torch.constant.int 1 | |
%2706 = torch.aten.slice.Tensor %2703, %int0_2120, %int0_2121, %777, %int1_2122 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2706, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2123 = torch.constant.int 1 | |
%int0_2124 = torch.constant.int 0 | |
%int9223372036854775807_2125 = torch.constant.int 9223372036854775807 | |
%int1_2126 = torch.constant.int 1 | |
%2707 = torch.aten.slice.Tensor %2706, %int1_2123, %int0_2124, %int9223372036854775807_2125, %int1_2126 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2707, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2127 = torch.constant.int 0 | |
%int0_2128 = torch.constant.int 0 | |
%int1_2129 = torch.constant.int 1 | |
%2708 = torch.aten.slice.Tensor %2705, %int0_2127, %int0_2128, %777, %int1_2129 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2708, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2130 = torch.constant.int 1 | |
%int0_2131 = torch.constant.int 0 | |
%int9223372036854775807_2132 = torch.constant.int 9223372036854775807 | |
%int1_2133 = torch.constant.int 1 | |
%2709 = torch.aten.slice.Tensor %2708, %int1_2130, %int0_2131, %int9223372036854775807_2132, %int1_2133 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2709, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2134 = torch.constant.int 0 | |
%2710 = torch.aten.unsqueeze %2707, %int0_2134 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2710, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2135 = torch.constant.int 1 | |
%int0_2136 = torch.constant.int 0 | |
%int9223372036854775807_2137 = torch.constant.int 9223372036854775807 | |
%int1_2138 = torch.constant.int 1 | |
%2711 = torch.aten.slice.Tensor %2710, %int1_2135, %int0_2136, %int9223372036854775807_2137, %int1_2138 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2711, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2139 = torch.constant.int 2 | |
%2712 = torch.aten.unsqueeze %2711, %int2_2139 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2712, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2140 = torch.constant.int 3 | |
%int0_2141 = torch.constant.int 0 | |
%int9223372036854775807_2142 = torch.constant.int 9223372036854775807 | |
%int1_2143 = torch.constant.int 1 | |
%2713 = torch.aten.slice.Tensor %2712, %int3_2140, %int0_2141, %int9223372036854775807_2142, %int1_2143 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2713, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2144 = torch.constant.int 4 | |
%int1_2145 = torch.constant.int 1 | |
%int1_2146 = torch.constant.int 1 | |
%int1_2147 = torch.constant.int 1 | |
%2714 = torch.prim.ListConstruct %int4_2144, %int1_2145, %int1_2146, %int1_2147 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2715 = torch.aten.repeat %2713, %2714 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2715, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2148 = torch.constant.int 0 | |
%2716 = torch.aten.unsqueeze %2709, %int0_2148 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2716, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2149 = torch.constant.int 1 | |
%int0_2150 = torch.constant.int 0 | |
%int9223372036854775807_2151 = torch.constant.int 9223372036854775807 | |
%int1_2152 = torch.constant.int 1 | |
%2717 = torch.aten.slice.Tensor %2716, %int1_2149, %int0_2150, %int9223372036854775807_2151, %int1_2152 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2717, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2153 = torch.constant.int 2 | |
%2718 = torch.aten.unsqueeze %2717, %int2_2153 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2718, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2154 = torch.constant.int 3 | |
%int0_2155 = torch.constant.int 0 | |
%int9223372036854775807_2156 = torch.constant.int 9223372036854775807 | |
%int1_2157 = torch.constant.int 1 | |
%2719 = torch.aten.slice.Tensor %2718, %int3_2154, %int0_2155, %int9223372036854775807_2156, %int1_2157 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2719, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2158 = torch.constant.int 4 | |
%int1_2159 = torch.constant.int 1 | |
%int1_2160 = torch.constant.int 1 | |
%int1_2161 = torch.constant.int 1 | |
%2720 = torch.prim.ListConstruct %int4_2158, %int1_2159, %int1_2160, %int1_2161 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2721 = torch.aten.repeat %2719, %2720 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %2721, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%2722 = torch.aten.mul.Tensor %2601, %2715 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2722, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_2162 = torch.constant.int 3 | |
%int0_2163 = torch.constant.int 0 | |
%int64_2164 = torch.constant.int 64 | |
%int1_2165 = torch.constant.int 1 | |
%2723 = torch.aten.slice.Tensor %2601, %int3_2162, %int0_2163, %int64_2164, %int1_2165 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2723, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_2166 = torch.constant.int 3 | |
%int64_2167 = torch.constant.int 64 | |
%int9223372036854775807_2168 = torch.constant.int 9223372036854775807 | |
%int1_2169 = torch.constant.int 1 | |
%2724 = torch.aten.slice.Tensor %2601, %int3_2166, %int64_2167, %int9223372036854775807_2168, %int1_2169 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2724, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2725 = torch.aten.neg %2724 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2725, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%2726 = torch.prim.ListConstruct %2725, %2723 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2170 = torch.constant.int -1 | |
%2727 = torch.aten.cat %2726, %int-1_2170 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2727, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%2728 = torch.aten.mul.Tensor %2727, %2721 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2728, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_2171 = torch.constant.int 1 | |
%2729 = torch.aten.add.Tensor %2722, %2728, %int1_2171 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2729, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
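// Write the rotated K into the paged KV cache %2436 (viewed as [pages, 32, 2, 32, 8, 128]): the flat slot
// index is page_id (%arg2) * 64 + 10, which appears to select this layer's K slot within each page; the
// f8 data is bitcast to si8 so index_put can scatter it, then bitcast back and flattened to
// [pages, 2097152].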
%int64_2172 = torch.constant.int 64 | |
%2730 = torch.aten.mul.Scalar %arg2, %int64_2172 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2730, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int10 = torch.constant.int 10 | |
%int1_2173 = torch.constant.int 1 | |
%2731 = torch.aten.add.Scalar %2730, %int10, %int1_2173 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2731, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_2174 = torch.constant.int 4 | |
%int32_2175 = torch.constant.int 32 | |
%int8_2176 = torch.constant.int 8 | |
%int128_2177 = torch.constant.int 128 | |
%2732 = torch.prim.ListConstruct %int4_2174, %775, %int32_2175, %int8_2176, %int128_2177 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2733 = torch.aten.view %2729, %2732 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2733, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2178 = torch.constant.int 32 | |
%int8_2179 = torch.constant.int 8 | |
%int128_2180 = torch.constant.int 128 | |
%2734 = torch.prim.ListConstruct %997, %int32_2178, %int8_2179, %int128_2180 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2735 = torch.aten.view %2733, %2734 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2735, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2736 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2737 = torch.aten.view %2731, %2736 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2737, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_2181 = torch.constant.int 26 | |
%2738 = torch.prims.convert_element_type %2735, %int26_2181 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2738, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2182 = torch.constant.int 1 | |
%2739 = torch.aten.view.dtype %2738, %int1_2182 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2739, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2740 = torch.aten.detach %2739 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2740, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2741 = torch.aten.detach %2740 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2741, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2183 = torch.constant.int 32 | |
%int2_2184 = torch.constant.int 2 | |
%int32_2185 = torch.constant.int 32 | |
%int8_2186 = torch.constant.int 8 | |
%int128_2187 = torch.constant.int 128 | |
%2742 = torch.prim.ListConstruct %776, %int32_2183, %int2_2184, %int32_2185, %int8_2186, %int128_2187 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2743 = torch.aten.view %2436, %2742 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2743, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2188 = torch.constant.int 32 | |
%int8_2189 = torch.constant.int 8 | |
%int128_2190 = torch.constant.int 128 | |
%2744 = torch.prim.ListConstruct %990, %int32_2188, %int8_2189, %int128_2190 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2745 = torch.aten.view %2743, %2744 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2745, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2191 = torch.constant.int 1 | |
%2746 = torch.aten.view.dtype %2745, %int1_2191 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2746, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2747 = torch.aten.detach %2746 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2747, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2748 = torch.aten.detach %2747 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2748, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2749 = torch.prim.ListConstruct %2737 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2192 = torch.constant.bool false | |
%2750 = torch.aten.index_put %2748, %2749, %2741, %false_2192 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2750, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2193 = torch.constant.int 26 | |
%2751 = torch.aten.view.dtype %2750, %int26_2193 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2751, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2752 = torch.aten.detach %2751 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2752, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2753 = torch.aten.detach %2752 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2753, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2194 = torch.constant.int 32 | |
%int2_2195 = torch.constant.int 2 | |
%int32_2196 = torch.constant.int 32 | |
%int8_2197 = torch.constant.int 8 | |
%int128_2198 = torch.constant.int 128 | |
%2754 = torch.prim.ListConstruct %776, %int32_2194, %int2_2195, %int32_2196, %int8_2197, %int128_2198 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2755 = torch.aten.view %2753, %2754 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2755, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2199 = torch.constant.int 2097152 | |
%2756 = torch.prim.ListConstruct %776, %int2097152_2199 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2757 = torch.aten.view %2755, %2756 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2757, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
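// Repeat the cache update for V: the slot index is the previous one plus 1 (page_id * 64 + 11), and the
// result is flattened back to [pages, 2097152] as the updated cache view.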
%int4_2200 = torch.constant.int 4 | |
%int32_2201 = torch.constant.int 32 | |
%int8_2202 = torch.constant.int 8 | |
%int128_2203 = torch.constant.int 128 | |
%2758 = torch.prim.ListConstruct %int4_2200, %775, %int32_2201, %int8_2202, %int128_2203 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2759 = torch.aten.view %2603, %2758 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2759, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2204 = torch.constant.int 32 | |
%int8_2205 = torch.constant.int 8 | |
%int128_2206 = torch.constant.int 128 | |
%2760 = torch.prim.ListConstruct %997, %int32_2204, %int8_2205, %int128_2206 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2761 = torch.aten.view %2759, %2760 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2761, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2207 = torch.constant.int 1 | |
%int1_2208 = torch.constant.int 1 | |
%2762 = torch.aten.add.Scalar %2731, %int1_2207, %int1_2208 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %2762, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%2763 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%2764 = torch.aten.view %2762, %2763 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %2764, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_2209 = torch.constant.int 26 | |
%2765 = torch.prims.convert_element_type %2761, %int26_2209 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2765, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2210 = torch.constant.int 1 | |
%2766 = torch.aten.view.dtype %2765, %int1_2210 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2766, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2767 = torch.aten.detach %2766 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2767, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2768 = torch.aten.detach %2767 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2768, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2211 = torch.constant.int 32 | |
%int2_2212 = torch.constant.int 2 | |
%int32_2213 = torch.constant.int 32 | |
%int8_2214 = torch.constant.int 8 | |
%int128_2215 = torch.constant.int 128 | |
%2769 = torch.prim.ListConstruct %776, %int32_2211, %int2_2212, %int32_2213, %int8_2214, %int128_2215 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2770 = torch.aten.view %2757, %2769 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2770, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2216 = torch.constant.int 32 | |
%int8_2217 = torch.constant.int 8 | |
%int128_2218 = torch.constant.int 128 | |
%2771 = torch.prim.ListConstruct %990, %int32_2216, %int8_2217, %int128_2218 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2772 = torch.aten.view %2770, %2771 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2772, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2219 = torch.constant.int 1 | |
%2773 = torch.aten.view.dtype %2772, %int1_2219 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2773, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2774 = torch.aten.detach %2773 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2774, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2775 = torch.aten.detach %2774 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2775, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%2776 = torch.prim.ListConstruct %2764 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2220 = torch.constant.bool false | |
%2777 = torch.aten.index_put %2775, %2776, %2768, %false_2220 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %2777, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2221 = torch.constant.int 26 | |
%2778 = torch.aten.view.dtype %2777, %int26_2221 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2778, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2779 = torch.aten.detach %2778 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2779, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%2780 = torch.aten.detach %2779 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2780, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2222 = torch.constant.int 32 | |
%int2_2223 = torch.constant.int 2 | |
%int32_2224 = torch.constant.int 32 | |
%int8_2225 = torch.constant.int 8 | |
%int128_2226 = torch.constant.int 128 | |
%2781 = torch.prim.ListConstruct %776, %int32_2222, %int2_2223, %int32_2224, %int8_2225, %int128_2226 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2782 = torch.aten.view %2780, %2781 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2782, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2227 = torch.constant.int 2097152 | |
%2783 = torch.prim.ListConstruct %776, %int2097152_2227 : (!torch.int, !torch.int) -> !torch.list<int> | |
%2784 = torch.aten.view %2782, %2783 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2784, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
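// The updated cache is folded back to its flat [?, 2097152] storage layout. Next, the 8 KV heads of K and V are each
// broadcast 4x (unsqueeze / expand / clone / view) to match the 32 query heads, grouped-query-attention style.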
%int-2_2228 = torch.constant.int -2 | |
%2785 = torch.aten.unsqueeze %2729, %int-2_2228 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2785, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2229 = torch.constant.int 4 | |
%int8_2230 = torch.constant.int 8 | |
%int4_2231 = torch.constant.int 4 | |
%int128_2232 = torch.constant.int 128 | |
%2786 = torch.prim.ListConstruct %int4_2229, %777, %int8_2230, %int4_2231, %int128_2232 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2233 = torch.constant.bool false | |
%2787 = torch.aten.expand %2785, %2786, %false_2233 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2787, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2234 = torch.constant.int 0 | |
%2788 = torch.aten.clone %2787, %int0_2234 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2788, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2235 = torch.constant.int 4 | |
%int32_2236 = torch.constant.int 32 | |
%int128_2237 = torch.constant.int 128 | |
%2789 = torch.prim.ListConstruct %int4_2235, %777, %int32_2236, %int128_2237 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2790 = torch.aten._unsafe_view %2788, %2789 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2790, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_2238 = torch.constant.int -2 | |
%2791 = torch.aten.unsqueeze %2603, %int-2_2238 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2791, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2239 = torch.constant.int 4 | |
%int8_2240 = torch.constant.int 8 | |
%int4_2241 = torch.constant.int 4 | |
%int128_2242 = torch.constant.int 128 | |
%2792 = torch.prim.ListConstruct %int4_2239, %777, %int8_2240, %int4_2241, %int128_2242 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2243 = torch.constant.bool false | |
%2793 = torch.aten.expand %2791, %2792, %false_2243 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2793, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2244 = torch.constant.int 0 | |
%2794 = torch.aten.clone %2793, %int0_2244 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2794, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2245 = torch.constant.int 4 | |
%int32_2246 = torch.constant.int 32 | |
%int128_2247 = torch.constant.int 128 | |
%2795 = torch.prim.ListConstruct %int4_2245, %777, %int32_2246, %int128_2247 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2796 = torch.aten._unsafe_view %2794, %2795 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2796, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
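// Transpose Q (%2666), the expanded K (%2790) and V (%2796) from [4, seq, 32, 128] to [4, 32, seq, 128] so that
// heads become the batch-like dimension expected by the attention kernel.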
%int1_2248 = torch.constant.int 1 | |
%int2_2249 = torch.constant.int 2 | |
%2797 = torch.aten.transpose.int %2666, %int1_2248, %int2_2249 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2797, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2250 = torch.constant.int 1 | |
%int2_2251 = torch.constant.int 2 | |
%2798 = torch.aten.transpose.int %2790, %int1_2250, %int2_2251 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2798, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2252 = torch.constant.int 1 | |
%int2_2253 = torch.constant.int 2 | |
%2799 = torch.aten.transpose.int %2796, %int1_2252, %int2_2253 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2799, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2254 = torch.constant.int 26 | |
%2800 = torch.prims.convert_element_type %2797, %int26_2254 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2800, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2255 = torch.constant.int 26 | |
%2801 = torch.prims.convert_element_type %2798, %int26_2255 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2801, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2256 = torch.constant.int 26 | |
%2802 = torch.prims.convert_element_type %2799, %int26_2256 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2802, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2257 = torch.constant.int 26 | |
%2803 = torch.prims.convert_element_type %803, %int26_2257 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2803, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_2258 = torch.constant.int 0 | |
%int0_2259 = torch.constant.int 0 | |
%2804 = torch.aten.select.int %2803, %int0_2258, %int0_2259 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2804, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_2260 = torch.constant.int 0 | |
%int0_2261 = torch.constant.int 0 | |
%2805 = torch.aten.select.int %2804, %int0_2260, %int0_2261 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2805, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_2262 = torch.constant.int 0 | |
%int0_2263 = torch.constant.int 0 | |
%int9223372036854775807_2264 = torch.constant.int 9223372036854775807 | |
%int1_2265 = torch.constant.int 1 | |
%2806 = torch.aten.slice.Tensor %2805, %int0_2262, %int0_2263, %int9223372036854775807_2264, %int1_2265 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2806, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_2266 = torch.constant.int 1 | |
%int0_2267 = torch.constant.int 0 | |
%int9223372036854775807_2268 = torch.constant.int 9223372036854775807 | |
%int1_2269 = torch.constant.int 1 | |
%2807 = torch.aten.slice.Tensor %2806, %int1_2266, %int0_2267, %int9223372036854775807_2268, %int1_2269 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2807, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%none_2270 = torch.constant.none | |
%2808 = torch.aten.clone %131, %none_2270 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%2809 = torch.aten.detach %2808 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2810 = torch.aten.detach %2809 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%2811 = torch.aten.detach %2810 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
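// Q, K, V, the 2-D f8 mask and the scalar scale (%131) are lowered to builtin tensors and handed to the sharktank
// masked flash-attention microkernel below; the fused kernel returns the attention output in f32.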
%2812 = torch_c.to_builtin_tensor %2800 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2813 = torch_c.to_builtin_tensor %2801 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2814 = torch_c.to_builtin_tensor %2802 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%2815 = torch_c.to_builtin_tensor %2807 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%2816 = torch_c.to_builtin_tensor %2811 : !torch.vtensor<[],f32> -> tensor<f32> | |
%2817 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%2812, %2813, %2814, %2816, %2815) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%2818 = torch_c.from_builtin_tensor %2817 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %2818, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
%int1_2271 = torch.constant.int 1 | |
%int2_2272 = torch.constant.int 2 | |
%2819 = torch.aten.transpose.int %2818, %int1_2271, %int2_2272 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2819, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_2273 = torch.constant.int 0 | |
%2820 = torch.aten.clone %2819, %int0_2273 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %2820, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_2274 = torch.constant.int 4 | |
%int4096_2275 = torch.constant.int 4096 | |
%2821 = torch.prim.ListConstruct %int4_2274, %777, %int4096_2275 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2822 = torch.aten._unsafe_view %2820, %2821 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2822, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
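// Quantize the attention output for the attn_output projection: divide by the per-tensor input rscale (%132), clamp
// to [-240, 240] (the finite range of f8E4M3FNUZ) and cast to f8, then run a batched transpose-B matmul against the
// weight expanded to [4, 4096, 4096].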
%2823 = torch.aten.div.Tensor %2822, %132 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2823, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2276 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2277 = torch.constant.float 2.400000e+02 | |
%2824 = torch.aten.clamp %2823, %float-2.400000e02_2276, %float2.400000e02_2277 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2824, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2278 = torch.constant.int 26 | |
%2825 = torch.prims.convert_element_type %2824, %int26_2278 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2825, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2279 = torch.constant.int 0 | |
%2826 = torch.aten.unsqueeze %133, %int0_2279 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2280 = torch.constant.int 4 | |
%int4096_2281 = torch.constant.int 4096 | |
%int4096_2282 = torch.constant.int 4096 | |
%2827 = torch.prim.ListConstruct %int4_2280, %int4096_2281, %int4096_2282 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2283 = torch.constant.bool false | |
%2828 = torch.aten.expand %2826, %2827, %false_2283 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2829 = torch_c.to_builtin_tensor %2825 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2830 = torch_c.to_builtin_tensor %2828 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2831 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2829, %2830) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2832 = torch_c.from_builtin_tensor %2831 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2832, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2833 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2834 = torch.aten.permute %134, %2833 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2835 = torch.aten.mul.Tensor %132, %2834 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2284 = torch.constant.int 6 | |
%2836 = torch.prims.convert_element_type %2832, %int6_2284 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2836, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2837 = torch.aten.mul.Tensor %2836, %2835 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2837, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_2285 = torch.constant.int 1 | |
%2838 = torch.aten.add.Tensor %2548, %2837, %int1_2285 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2838, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
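// Residual stream updated (%2838 = %2548 + rescaled projection). The ops below form an RMSNorm:
// x * rsqrt(mean(x^2, dim=-1) + 1e-5), scaled by the bf16 ffn_norm weight (%135).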
%int6_2286 = torch.constant.int 6 | |
%2839 = torch.prims.convert_element_type %2838, %int6_2286 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2839, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2287 = torch.constant.int 2 | |
%2840 = torch.aten.pow.Tensor_Scalar %2839, %int2_2287 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2840, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2288 = torch.constant.int -1 | |
%2841 = torch.prim.ListConstruct %int-1_2288 : (!torch.int) -> !torch.list<int> | |
%true_2289 = torch.constant.bool true | |
%none_2290 = torch.constant.none | |
%2842 = torch.aten.mean.dim %2840, %2841, %true_2289, %none_2290 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2842, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2291 = torch.constant.float 1.000000e-05 | |
%int1_2292 = torch.constant.int 1 | |
%2843 = torch.aten.add.Scalar %2842, %float1.000000e-05_2291, %int1_2292 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2843, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2844 = torch.aten.rsqrt %2843 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2844, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2845 = torch.aten.mul.Tensor %2839, %2844 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2845, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2293 = torch.constant.int 6 | |
%2846 = torch.prims.convert_element_type %2845, %int6_2293 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2846, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2847 = torch.aten.mul.Tensor %135, %2846 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2847, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2294 = torch.constant.int 6 | |
%2848 = torch.prims.convert_element_type %2847, %int6_2294 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2848, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
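// SwiGLU feed-forward, gate branch: quantize the normed activations (divide by %136, clamp to +/-240, cast to
// f8E4M3FNUZ), project through the 14336x4096 gate weight, rescale the f32 result, and apply SiLU.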
%2849 = torch.aten.div.Tensor %2848, %136 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2849, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2295 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2296 = torch.constant.float 2.400000e+02 | |
%2850 = torch.aten.clamp %2849, %float-2.400000e02_2295, %float2.400000e02_2296 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2850, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2297 = torch.constant.int 26 | |
%2851 = torch.prims.convert_element_type %2850, %int26_2297 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2851, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2298 = torch.constant.int 0 | |
%2852 = torch.aten.unsqueeze %137, %int0_2298 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2299 = torch.constant.int 4 | |
%int14336_2300 = torch.constant.int 14336 | |
%int4096_2301 = torch.constant.int 4096 | |
%2853 = torch.prim.ListConstruct %int4_2299, %int14336_2300, %int4096_2301 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2302 = torch.constant.bool false | |
%2854 = torch.aten.expand %2852, %2853, %false_2302 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2855 = torch_c.to_builtin_tensor %2851 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2856 = torch_c.to_builtin_tensor %2854 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2857 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2855, %2856) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2858 = torch_c.from_builtin_tensor %2857 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2858, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2859 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2860 = torch.aten.permute %138, %2859 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2861 = torch.aten.mul.Tensor %136, %2860 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2303 = torch.constant.int 6 | |
%2862 = torch.prims.convert_element_type %2858, %int6_2303 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2862, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2863 = torch.aten.mul.Tensor %2862, %2861 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2863, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2864 = torch.aten.silu %2863 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2864, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2865 = torch.aten.div.Tensor %2848, %139 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2865, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2304 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2305 = torch.constant.float 2.400000e+02 | |
%2866 = torch.aten.clamp %2865, %float-2.400000e02_2304, %float2.400000e02_2305 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2866, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2306 = torch.constant.int 26 | |
%2867 = torch.prims.convert_element_type %2866, %int26_2306 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2867, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2307 = torch.constant.int 0 | |
%2868 = torch.aten.unsqueeze %140, %int0_2307 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2308 = torch.constant.int 4 | |
%int14336_2309 = torch.constant.int 14336 | |
%int4096_2310 = torch.constant.int 4096 | |
%2869 = torch.prim.ListConstruct %int4_2308, %int14336_2309, %int4096_2310 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2311 = torch.constant.bool false | |
%2870 = torch.aten.expand %2868, %2869, %false_2311 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%2871 = torch_c.to_builtin_tensor %2867 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2872 = torch_c.to_builtin_tensor %2870 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%2873 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%2871, %2872) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%2874 = torch_c.from_builtin_tensor %2873 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2874, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2875 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2876 = torch.aten.permute %141, %2875 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2877 = torch.aten.mul.Tensor %139, %2876 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2312 = torch.constant.int 6 | |
%2878 = torch.prims.convert_element_type %2874, %int6_2312 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2878, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2879 = torch.aten.mul.Tensor %2878, %2877 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2879, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%2880 = torch.aten.mul.Tensor %2864, %2879 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2880, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
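// %2880 is the SwiGLU product silu(gate) * up. It is quantized once more and projected back to the model width
// through the 4096x14336 down-projection weight, rescaled, and added to the residual stream.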
%2881 = torch.aten.div.Tensor %2880, %142 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2881, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_2313 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2314 = torch.constant.float 2.400000e+02 | |
%2882 = torch.aten.clamp %2881, %float-2.400000e02_2313, %float2.400000e02_2314 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %2882, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_2315 = torch.constant.int 26 | |
%2883 = torch.prims.convert_element_type %2882, %int26_2315 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2883, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_2316 = torch.constant.int 0 | |
%2884 = torch.aten.unsqueeze %143, %int0_2316 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_2317 = torch.constant.int 4 | |
%int4096_2318 = torch.constant.int 4096 | |
%int14336_2319 = torch.constant.int 14336 | |
%2885 = torch.prim.ListConstruct %int4_2317, %int4096_2318, %int14336_2319 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2320 = torch.constant.bool false | |
%2886 = torch.aten.expand %2884, %2885, %false_2320 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%2887 = torch_c.to_builtin_tensor %2883 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%2888 = torch_c.to_builtin_tensor %2886 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%2889 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%2887, %2888) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2890 = torch_c.from_builtin_tensor %2889 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2890, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2891 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%2892 = torch.aten.permute %144, %2891 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%2893 = torch.aten.mul.Tensor %142, %2892 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2321 = torch.constant.int 6 | |
%2894 = torch.prims.convert_element_type %2890, %int6_2321 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2894, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2895 = torch.aten.mul.Tensor %2894, %2893 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2895, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_2322 = torch.constant.int 1 | |
%2896 = torch.aten.add.Tensor %2838, %2895, %int1_2322 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2896, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
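// %2896 closes this decoder layer. What follows appears to be the next layer's attention RMSNorm (weight %145)
// followed by its quantized Q/K/V projections.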
%int6_2323 = torch.constant.int 6 | |
%2897 = torch.prims.convert_element_type %2896, %int6_2323 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2897, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2324 = torch.constant.int 2 | |
%2898 = torch.aten.pow.Tensor_Scalar %2897, %int2_2324 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2898, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2325 = torch.constant.int -1 | |
%2899 = torch.prim.ListConstruct %int-1_2325 : (!torch.int) -> !torch.list<int> | |
%true_2326 = torch.constant.bool true | |
%none_2327 = torch.constant.none | |
%2900 = torch.aten.mean.dim %2898, %2899, %true_2326, %none_2327 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2900, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2328 = torch.constant.float 1.000000e-05 | |
%int1_2329 = torch.constant.int 1 | |
%2901 = torch.aten.add.Scalar %2900, %float1.000000e-05_2328, %int1_2329 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2901, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2902 = torch.aten.rsqrt %2901 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %2902, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%2903 = torch.aten.mul.Tensor %2897, %2902 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2903, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2330 = torch.constant.int 6 | |
%2904 = torch.prims.convert_element_type %2903, %int6_2330 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2904, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2905 = torch.aten.mul.Tensor %145, %2904 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2905, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2331 = torch.constant.int 6 | |
%2906 = torch.prims.convert_element_type %2905, %int6_2331 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2906, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
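// Q, K and V projections: each path quantizes the normed input (divide by its rscale, clamp to +/-240, cast to
// f8E4M3FNUZ), runs a batched transpose-B matmul against the expanded weight (4096x4096 for Q, 1024x4096 for K and
// V), and re-quantizes the f32 result back to f8.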
%2907 = torch.aten.div.Tensor %2906, %146 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2907, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2332 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2333 = torch.constant.float 2.400000e+02 | |
%2908 = torch.aten.clamp %2907, %float-2.400000e02_2332, %float2.400000e02_2333 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2908, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2334 = torch.constant.int 26 | |
%2909 = torch.prims.convert_element_type %2908, %int26_2334 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2909, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2335 = torch.constant.int 0 | |
%2910 = torch.aten.unsqueeze %147, %int0_2335 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2336 = torch.constant.int 4 | |
%int4096_2337 = torch.constant.int 4096 | |
%int4096_2338 = torch.constant.int 4096 | |
%2911 = torch.prim.ListConstruct %int4_2336, %int4096_2337, %int4096_2338 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2339 = torch.constant.bool false | |
%2912 = torch.aten.expand %2910, %2911, %false_2339 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%2913 = torch_c.to_builtin_tensor %2909 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2914 = torch_c.to_builtin_tensor %2912 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%2915 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%2913, %2914) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%2916 = torch_c.from_builtin_tensor %2915 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2916, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%2917 = torch.aten.div.Tensor %2916, %148 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2917, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2340 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2341 = torch.constant.float 2.400000e+02 | |
%2918 = torch.aten.clamp %2917, %float-2.400000e02_2340, %float2.400000e02_2341 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2918, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2342 = torch.constant.int 26 | |
%2919 = torch.prims.convert_element_type %2918, %int26_2342 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2919, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%2920 = torch.aten.div.Tensor %2906, %149 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2920, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2343 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2344 = torch.constant.float 2.400000e+02 | |
%2921 = torch.aten.clamp %2920, %float-2.400000e02_2343, %float2.400000e02_2344 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2921, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2345 = torch.constant.int 26 | |
%2922 = torch.prims.convert_element_type %2921, %int26_2345 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2922, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2346 = torch.constant.int 0 | |
%2923 = torch.aten.unsqueeze %150, %int0_2346 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_2347 = torch.constant.int 4 | |
%int1024_2348 = torch.constant.int 1024 | |
%int4096_2349 = torch.constant.int 4096 | |
%2924 = torch.prim.ListConstruct %int4_2347, %int1024_2348, %int4096_2349 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2350 = torch.constant.bool false | |
%2925 = torch.aten.expand %2923, %2924, %false_2350 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2926 = torch_c.to_builtin_tensor %2922 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2927 = torch_c.to_builtin_tensor %2925 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2928 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2926, %2927) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2929 = torch_c.from_builtin_tensor %2928 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2929, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2930 = torch.aten.div.Tensor %2929, %151 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2930, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_2351 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2352 = torch.constant.float 2.400000e+02 | |
%2931 = torch.aten.clamp %2930, %float-2.400000e02_2351, %float2.400000e02_2352 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2931, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_2353 = torch.constant.int 26 | |
%2932 = torch.prims.convert_element_type %2931, %int26_2353 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2932, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%2933 = torch.aten.div.Tensor %2906, %152 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2933, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2354 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2355 = torch.constant.float 2.400000e+02 | |
%2934 = torch.aten.clamp %2933, %float-2.400000e02_2354, %float2.400000e02_2355 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %2934, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2356 = torch.constant.int 26 | |
%2935 = torch.prims.convert_element_type %2934, %int26_2356 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2935, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2357 = torch.constant.int 0 | |
%2936 = torch.aten.unsqueeze %153, %int0_2357 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_2358 = torch.constant.int 4 | |
%int1024_2359 = torch.constant.int 1024 | |
%int4096_2360 = torch.constant.int 4096 | |
%2937 = torch.prim.ListConstruct %int4_2358, %int1024_2359, %int4096_2360 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2361 = torch.constant.bool false | |
%2938 = torch.aten.expand %2936, %2937, %false_2361 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%2939 = torch_c.to_builtin_tensor %2935 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%2940 = torch_c.to_builtin_tensor %2938 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%2941 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%2939, %2940) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%2942 = torch_c.from_builtin_tensor %2941 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2942, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%2943 = torch.aten.div.Tensor %2942, %154 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2943, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_2362 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2363 = torch.constant.float 2.400000e+02 | |
%2944 = torch.aten.clamp %2943, %float-2.400000e02_2362, %float2.400000e02_2363 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %2944, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%int26_2364 = torch.constant.int 26 | |
%2945 = torch.prims.convert_element_type %2944, %int26_2364 : !torch.vtensor<[4,?,1024],f32>, !torch.int -> !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2945, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f8E4M3FNUZ> | |
%int4_2365 = torch.constant.int 4 | |
%int32_2366 = torch.constant.int 32 | |
%int128_2367 = torch.constant.int 128 | |
%2946 = torch.prim.ListConstruct %int4_2365, %777, %int32_2366, %int128_2367 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2947 = torch.aten.view %2919, %2946 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2947, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int4_2368 = torch.constant.int 4 | |
%int8_2369 = torch.constant.int 8 | |
%int128_2370 = torch.constant.int 128 | |
%2948 = torch.prim.ListConstruct %int4_2368, %777, %int8_2369, %int128_2370 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2949 = torch.aten.view %2932, %2948 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2949, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int4_2371 = torch.constant.int 4 | |
%int8_2372 = torch.constant.int 8 | |
%int128_2373 = torch.constant.int 128 | |
%2950 = torch.prim.ListConstruct %int4_2371, %777, %int8_2372, %int128_2373 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%2951 = torch.aten.view %2945, %2950 : !torch.vtensor<[4,?,1024],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %2951, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
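// Rotary-embedding tables: positions 0..131071 and 64 even channel indices yield inverse frequencies with base
// 5.0e5; the where/smoothing arithmetic below appears to implement llama3-style context scaling (factor 8, original
// context 8192, wavelength cutoffs 2048 and 8192) before the cos/sin tables are materialized in bf16.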
%int131072_2374 = torch.constant.int 131072 | |
%none_2375 = torch.constant.none | |
%none_2376 = torch.constant.none | |
%cpu_2377 = torch.constant.device "cpu" | |
%false_2378 = torch.constant.bool false | |
%2952 = torch.aten.arange %int131072_2374, %none_2375, %none_2376, %cpu_2377, %false_2378 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2379 = torch.constant.int 0 | |
%int128_2380 = torch.constant.int 128 | |
%int2_2381 = torch.constant.int 2 | |
%int4_2382 = torch.constant.int 4 | |
%none_2383 = torch.constant.none | |
%cpu_2384 = torch.constant.device "cpu" | |
%false_2385 = torch.constant.bool false | |
%2953 = torch.aten.arange.start_step %int0_2379, %int128_2380, %int2_2381, %int4_2382, %none_2383, %cpu_2384, %false_2385 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2386 = torch.constant.int 6 | |
%2954 = torch.prims.convert_element_type %2953, %int6_2386 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2387 = torch.constant.int 128 | |
%2955 = torch.aten.div.Scalar %2954, %int128_2387 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2388 = torch.constant.float 5.000000e+05 | |
%2956 = torch.aten.pow.Scalar %float5.000000e05_2388, %2955 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2957 = torch.aten.reciprocal %2956 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2389 = torch.constant.float 1.000000e+00 | |
%2958 = torch.aten.mul.Scalar %2957, %float1.000000e00_2389 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%2959 = torch.aten.reciprocal %2958 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2390 = torch.constant.float 6.2831853071795862 | |
%2960 = torch.aten.mul.Scalar %2959, %float6.283190e00_2390 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2391 = torch.constant.float 8.192000e+03 | |
%2961 = torch.aten.gt.Scalar %2960, %float8.192000e03_2391 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2392 = torch.constant.int 8 | |
%2962 = torch.aten.div.Scalar %2958, %int8_2392 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2963 = torch.aten.where.self %2961, %2962, %2958 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2964 = torch.aten.reciprocal %2960 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2393 = torch.constant.int 8192 | |
%2965 = torch.aten.mul.Scalar %2964, %int8192_2393 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2394 = torch.constant.int 1 | |
%int1_2395 = torch.constant.int 1 | |
%2966 = torch.aten.sub.Scalar %2965, %int1_2394, %int1_2395 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2396 = torch.constant.int 3 | |
%2967 = torch.aten.div.Scalar %2966, %int3_2396 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2397 = torch.constant.int 1 | |
%int1_2398 = torch.constant.int 1 | |
%2968 = torch.aten.rsub.Scalar %2967, %int1_2397, %int1_2398 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%2969 = torch.aten.mul.Tensor %2968, %2963 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2399 = torch.constant.int 8 | |
%2970 = torch.aten.div.Scalar %2969, %int8_2399 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%2971 = torch.aten.mul.Tensor %2967, %2963 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2400 = torch.constant.int 1 | |
%2972 = torch.aten.add.Tensor %2970, %2971, %int1_2400 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2401 = torch.constant.float 2.048000e+03 | |
%2973 = torch.aten.lt.Scalar %2960, %float2.048000e03_2401 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2974 = torch.aten.bitwise_not %2973 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2402 = torch.constant.float 8.192000e+03 | |
%2975 = torch.aten.gt.Scalar %2960, %float8.192000e03_2402 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%2976 = torch.aten.bitwise_not %2975 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2977 = torch.aten.mul.Tensor %2974, %2976 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%2978 = torch.aten.where.self %2977, %2972, %2963 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%2979 = torch.prim.ListConstruct %2978, %2978 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2403 = torch.constant.int -1 | |
%2980 = torch.aten.cat %2979, %int-1_2403 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2404 = torch.constant.int 6 | |
%2981 = torch.prims.convert_element_type %2980, %int6_2404 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2405 = torch.constant.int 1 | |
%2982 = torch.aten.unsqueeze %2952, %int1_2405 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2406 = torch.constant.int 6 | |
%2983 = torch.prims.convert_element_type %2982, %int6_2406 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2407 = torch.constant.int 0 | |
%2984 = torch.aten.unsqueeze %2981, %int0_2407 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2408 = torch.constant.int 6 | |
%2985 = torch.prims.convert_element_type %2984, %int6_2408 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%2986 = torch.aten.mul.Tensor %2983, %2985 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%2987 = torch.aten.cos %2986 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2409 = torch.constant.int 15 | |
%2988 = torch.prims.convert_element_type %2987, %int15_2409 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%2989 = torch.aten.sin %2986 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2410 = torch.constant.int 15 | |
%2990 = torch.prims.convert_element_type %2989, %int15_2410 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%int0_2411 = torch.constant.int 0 | |
%int0_2412 = torch.constant.int 0 | |
%int1_2413 = torch.constant.int 1 | |
%2991 = torch.aten.slice.Tensor %2988, %int0_2411, %int0_2412, %777, %int1_2413 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2991, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2414 = torch.constant.int 1 | |
%int0_2415 = torch.constant.int 0 | |
%int9223372036854775807_2416 = torch.constant.int 9223372036854775807 | |
%int1_2417 = torch.constant.int 1 | |
%2992 = torch.aten.slice.Tensor %2991, %int1_2414, %int0_2415, %int9223372036854775807_2416, %int1_2417 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2992, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2418 = torch.constant.int 0 | |
%int0_2419 = torch.constant.int 0 | |
%int1_2420 = torch.constant.int 1 | |
%2993 = torch.aten.slice.Tensor %2990, %int0_2418, %int0_2419, %777, %int1_2420 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2993, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2421 = torch.constant.int 1 | |
%int0_2422 = torch.constant.int 0 | |
%int9223372036854775807_2423 = torch.constant.int 9223372036854775807 | |
%int1_2424 = torch.constant.int 1 | |
%2994 = torch.aten.slice.Tensor %2993, %int1_2421, %int0_2422, %int9223372036854775807_2423, %int1_2424 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %2994, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2425 = torch.constant.int 0 | |
%2995 = torch.aten.unsqueeze %2992, %int0_2425 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2995, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2426 = torch.constant.int 1 | |
%int0_2427 = torch.constant.int 0 | |
%int9223372036854775807_2428 = torch.constant.int 9223372036854775807 | |
%int1_2429 = torch.constant.int 1 | |
%2996 = torch.aten.slice.Tensor %2995, %int1_2426, %int0_2427, %int9223372036854775807_2428, %int1_2429 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %2996, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2430 = torch.constant.int 2 | |
%2997 = torch.aten.unsqueeze %2996, %int2_2430 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2997, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2431 = torch.constant.int 3 | |
%int0_2432 = torch.constant.int 0 | |
%int9223372036854775807_2433 = torch.constant.int 9223372036854775807 | |
%int1_2434 = torch.constant.int 1 | |
%2998 = torch.aten.slice.Tensor %2997, %int3_2431, %int0_2432, %int9223372036854775807_2433, %int1_2434 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %2998, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2435 = torch.constant.int 4 | |
%int1_2436 = torch.constant.int 1 | |
%int1_2437 = torch.constant.int 1 | |
%int1_2438 = torch.constant.int 1 | |
%2999 = torch.prim.ListConstruct %int4_2435, %int1_2436, %int1_2437, %int1_2438 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3000 = torch.aten.repeat %2998, %2999 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3000, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2439 = torch.constant.int 0 | |
%3001 = torch.aten.unsqueeze %2994, %int0_2439 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3001, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2440 = torch.constant.int 1 | |
%int0_2441 = torch.constant.int 0 | |
%int9223372036854775807_2442 = torch.constant.int 9223372036854775807 | |
%int1_2443 = torch.constant.int 1 | |
%3002 = torch.aten.slice.Tensor %3001, %int1_2440, %int0_2441, %int9223372036854775807_2442, %int1_2443 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3002, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2444 = torch.constant.int 2 | |
%3003 = torch.aten.unsqueeze %3002, %int2_2444 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3003, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2445 = torch.constant.int 3 | |
%int0_2446 = torch.constant.int 0 | |
%int9223372036854775807_2447 = torch.constant.int 9223372036854775807 | |
%int1_2448 = torch.constant.int 1 | |
%3004 = torch.aten.slice.Tensor %3003, %int3_2445, %int0_2446, %int9223372036854775807_2447, %int1_2448 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3004, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2449 = torch.constant.int 4 | |
%int1_2450 = torch.constant.int 1 | |
%int1_2451 = torch.constant.int 1 | |
%int1_2452 = torch.constant.int 1 | |
%3005 = torch.prim.ListConstruct %int4_2449, %int1_2450, %int1_2451, %int1_2452 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3006 = torch.aten.repeat %3004, %3005 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3006, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
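// Apply RoPE to the query heads: q * cos + rotate_half(q) * sin, where rotate_half splits the 128-wide head into two
// 64-wide halves, negates the second half and concatenates it in front of the first.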
%3007 = torch.aten.mul.Tensor %2947, %3000 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3007, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int3_2453 = torch.constant.int 3 | |
%int0_2454 = torch.constant.int 0 | |
%int64_2455 = torch.constant.int 64 | |
%int1_2456 = torch.constant.int 1 | |
%3008 = torch.aten.slice.Tensor %2947, %int3_2453, %int0_2454, %int64_2455, %int1_2456 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3008, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%int3_2457 = torch.constant.int 3 | |
%int64_2458 = torch.constant.int 64 | |
%int9223372036854775807_2459 = torch.constant.int 9223372036854775807 | |
%int1_2460 = torch.constant.int 1 | |
%3009 = torch.aten.slice.Tensor %2947, %int3_2457, %int64_2458, %int9223372036854775807_2459, %int1_2460 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3009, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%3010 = torch.aten.neg %3009 : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3010, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 64)> : !torch.vtensor<[4,?,32,64],f8E4M3FNUZ> | |
%3011 = torch.prim.ListConstruct %3010, %3008 : (!torch.vtensor<[4,?,32,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2461 = torch.constant.int -1 | |
%3012 = torch.aten.cat %3011, %int-1_2461 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3012, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%3013 = torch.aten.mul.Tensor %3012, %3006 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3013, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int1_2462 = torch.constant.int 1 | |
%3014 = torch.aten.add.Tensor %3007, %3013, %int1_2462 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3014, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
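// Recompute the RoPE frequency table for the key path: positions arange(131072), inv_freq = 1/500000^(k/128) for k = 0,2,...,126,
// then a wavelength-dependent rescale (freq/8 beyond wavelength 8192, smooth blend between wavelengths 2048 and 8192 --
// this appears to be the Llama-3 style long-context scaling), followed by the position x frequency outer product
// and cos/sin tables cast to bf16.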
%int131072_2463 = torch.constant.int 131072 | |
%none_2464 = torch.constant.none | |
%none_2465 = torch.constant.none | |
%cpu_2466 = torch.constant.device "cpu" | |
%false_2467 = torch.constant.bool false | |
%3015 = torch.aten.arange %int131072_2463, %none_2464, %none_2465, %cpu_2466, %false_2467 : !torch.int, !torch.none, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[131072],si64> | |
%int0_2468 = torch.constant.int 0 | |
%int128_2469 = torch.constant.int 128 | |
%int2_2470 = torch.constant.int 2 | |
%int4_2471 = torch.constant.int 4 | |
%none_2472 = torch.constant.none | |
%cpu_2473 = torch.constant.device "cpu" | |
%false_2474 = torch.constant.bool false | |
%3016 = torch.aten.arange.start_step %int0_2468, %int128_2469, %int2_2470, %int4_2471, %none_2472, %cpu_2473, %false_2474 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[64],si64> | |
%int6_2475 = torch.constant.int 6 | |
%3017 = torch.prims.convert_element_type %3016, %int6_2475 : !torch.vtensor<[64],si64>, !torch.int -> !torch.vtensor<[64],f32> | |
%int128_2476 = torch.constant.int 128 | |
%3018 = torch.aten.div.Scalar %3017, %int128_2476 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float5.000000e05_2477 = torch.constant.float 5.000000e+05 | |
%3019 = torch.aten.pow.Scalar %float5.000000e05_2477, %3018 : !torch.float, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3020 = torch.aten.reciprocal %3019 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float1.000000e00_2478 = torch.constant.float 1.000000e+00 | |
%3021 = torch.aten.mul.Scalar %3020, %float1.000000e00_2478 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%3022 = torch.aten.reciprocal %3021 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%float6.283190e00_2479 = torch.constant.float 6.2831853071795862 | |
%3023 = torch.aten.mul.Scalar %3022, %float6.283190e00_2479 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],f32> | |
%float8.192000e03_2480 = torch.constant.float 8.192000e+03 | |
%3024 = torch.aten.gt.Scalar %3023, %float8.192000e03_2480 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%int8_2481 = torch.constant.int 8 | |
%3025 = torch.aten.div.Scalar %3021, %int8_2481 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3026 = torch.aten.where.self %3024, %3025, %3021 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3027 = torch.aten.reciprocal %3023 : !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8192_2482 = torch.constant.int 8192 | |
%3028 = torch.aten.mul.Scalar %3027, %int8192_2482 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2483 = torch.constant.int 1 | |
%int1_2484 = torch.constant.int 1 | |
%3029 = torch.aten.sub.Scalar %3028, %int1_2483, %int1_2484 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%int3_2485 = torch.constant.int 3 | |
%3030 = torch.aten.div.Scalar %3029, %int3_2485 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%int1_2486 = torch.constant.int 1 | |
%int1_2487 = torch.constant.int 1 | |
%3031 = torch.aten.rsub.Scalar %3030, %int1_2486, %int1_2487 : !torch.vtensor<[64],f32>, !torch.int, !torch.int -> !torch.vtensor<[64],f32> | |
%3032 = torch.aten.mul.Tensor %3031, %3026 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int8_2488 = torch.constant.int 8 | |
%3033 = torch.aten.div.Scalar %3032, %int8_2488 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%3034 = torch.aten.mul.Tensor %3030, %3026 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%int1_2489 = torch.constant.int 1 | |
%3035 = torch.aten.add.Tensor %3033, %3034, %int1_2489 : !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32> | |
%float2.048000e03_2490 = torch.constant.float 2.048000e+03 | |
%3036 = torch.aten.lt.Scalar %3023, %float2.048000e03_2490 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3037 = torch.aten.bitwise_not %3036 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%float8.192000e03_2491 = torch.constant.float 8.192000e+03 | |
%3038 = torch.aten.gt.Scalar %3023, %float8.192000e03_2491 : !torch.vtensor<[64],f32>, !torch.float -> !torch.vtensor<[64],i1> | |
%3039 = torch.aten.bitwise_not %3038 : !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3040 = torch.aten.mul.Tensor %3037, %3039 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],i1> -> !torch.vtensor<[64],i1> | |
%3041 = torch.aten.where.self %3040, %3035, %3026 : !torch.vtensor<[64],i1>, !torch.vtensor<[64],f32>, !torch.vtensor<[64],f32> -> !torch.vtensor<[64],f32> | |
%3042 = torch.prim.ListConstruct %3041, %3041 : (!torch.vtensor<[64],f32>, !torch.vtensor<[64],f32>) -> !torch.list<vtensor> | |
%int-1_2492 = torch.constant.int -1 | |
%3043 = torch.aten.cat %3042, %int-1_2492 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[128],f32> | |
%int6_2493 = torch.constant.int 6 | |
%3044 = torch.prims.convert_element_type %3043, %int6_2493 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32> | |
%int1_2494 = torch.constant.int 1 | |
%3045 = torch.aten.unsqueeze %3015, %int1_2494 : !torch.vtensor<[131072],si64>, !torch.int -> !torch.vtensor<[131072,1],si64> | |
%int6_2495 = torch.constant.int 6 | |
%3046 = torch.prims.convert_element_type %3045, %int6_2495 : !torch.vtensor<[131072,1],si64>, !torch.int -> !torch.vtensor<[131072,1],f32> | |
%int0_2496 = torch.constant.int 0 | |
%3047 = torch.aten.unsqueeze %3044, %int0_2496 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%int6_2497 = torch.constant.int 6 | |
%3048 = torch.prims.convert_element_type %3047, %int6_2497 : !torch.vtensor<[1,128],f32>, !torch.int -> !torch.vtensor<[1,128],f32> | |
%3049 = torch.aten.mul.Tensor %3046, %3048 : !torch.vtensor<[131072,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%3050 = torch.aten.cos %3049 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2498 = torch.constant.int 15 | |
%3051 = torch.prims.convert_element_type %3050, %int15_2498 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
%3052 = torch.aten.sin %3049 : !torch.vtensor<[131072,128],f32> -> !torch.vtensor<[131072,128],f32> | |
%int15_2499 = torch.constant.int 15 | |
%3053 = torch.prims.convert_element_type %3052, %int15_2499 : !torch.vtensor<[131072,128],f32>, !torch.int -> !torch.vtensor<[131072,128],bf16> | |
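// Slice the cos (%3051) and sin (%3053) tables to the first %777 positions and broadcast them to [4, seq, 1, 128] for the key heads.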
%int0_2500 = torch.constant.int 0 | |
%int0_2501 = torch.constant.int 0 | |
%int1_2502 = torch.constant.int 1 | |
%3054 = torch.aten.slice.Tensor %3051, %int0_2500, %int0_2501, %777, %int1_2502 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3054, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2503 = torch.constant.int 1 | |
%int0_2504 = torch.constant.int 0 | |
%int9223372036854775807_2505 = torch.constant.int 9223372036854775807 | |
%int1_2506 = torch.constant.int 1 | |
%3055 = torch.aten.slice.Tensor %3054, %int1_2503, %int0_2504, %int9223372036854775807_2505, %int1_2506 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3055, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2507 = torch.constant.int 0 | |
%int0_2508 = torch.constant.int 0 | |
%int1_2509 = torch.constant.int 1 | |
%3056 = torch.aten.slice.Tensor %3053, %int0_2507, %int0_2508, %777, %int1_2509 : !torch.vtensor<[131072,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3056, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int1_2510 = torch.constant.int 1 | |
%int0_2511 = torch.constant.int 0 | |
%int9223372036854775807_2512 = torch.constant.int 9223372036854775807 | |
%int1_2513 = torch.constant.int 1 | |
%3057 = torch.aten.slice.Tensor %3056, %int1_2510, %int0_2511, %int9223372036854775807_2512, %int1_2513 : !torch.vtensor<[?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,128],bf16> | |
torch.bind_symbolic_shape %3057, [%773], affine_map<()[s0] -> (s0 * 32, 128)> : !torch.vtensor<[?,128],bf16> | |
%int0_2514 = torch.constant.int 0 | |
%3058 = torch.aten.unsqueeze %3055, %int0_2514 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3058, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2515 = torch.constant.int 1 | |
%int0_2516 = torch.constant.int 0 | |
%int9223372036854775807_2517 = torch.constant.int 9223372036854775807 | |
%int1_2518 = torch.constant.int 1 | |
%3059 = torch.aten.slice.Tensor %3058, %int1_2515, %int0_2516, %int9223372036854775807_2517, %int1_2518 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3059, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2519 = torch.constant.int 2 | |
%3060 = torch.aten.unsqueeze %3059, %int2_2519 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3060, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2520 = torch.constant.int 3 | |
%int0_2521 = torch.constant.int 0 | |
%int9223372036854775807_2522 = torch.constant.int 9223372036854775807 | |
%int1_2523 = torch.constant.int 1 | |
%3061 = torch.aten.slice.Tensor %3060, %int3_2520, %int0_2521, %int9223372036854775807_2522, %int1_2523 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3061, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2524 = torch.constant.int 4 | |
%int1_2525 = torch.constant.int 1 | |
%int1_2526 = torch.constant.int 1 | |
%int1_2527 = torch.constant.int 1 | |
%3062 = torch.prim.ListConstruct %int4_2524, %int1_2525, %int1_2526, %int1_2527 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3063 = torch.aten.repeat %3061, %3062 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3063, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
%int0_2528 = torch.constant.int 0 | |
%3064 = torch.aten.unsqueeze %3057, %int0_2528 : !torch.vtensor<[?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3064, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int1_2529 = torch.constant.int 1 | |
%int0_2530 = torch.constant.int 0 | |
%int9223372036854775807_2531 = torch.constant.int 9223372036854775807 | |
%int1_2532 = torch.constant.int 1 | |
%3065 = torch.aten.slice.Tensor %3064, %int1_2529, %int0_2530, %int9223372036854775807_2531, %int1_2532 : !torch.vtensor<[1,?,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,128],bf16> | |
torch.bind_symbolic_shape %3065, [%773], affine_map<()[s0] -> (1, s0 * 32, 128)> : !torch.vtensor<[1,?,128],bf16> | |
%int2_2533 = torch.constant.int 2 | |
%3066 = torch.aten.unsqueeze %3065, %int2_2533 : !torch.vtensor<[1,?,128],bf16>, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3066, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int3_2534 = torch.constant.int 3 | |
%int0_2535 = torch.constant.int 0 | |
%int9223372036854775807_2536 = torch.constant.int 9223372036854775807 | |
%int1_2537 = torch.constant.int 1 | |
%3067 = torch.aten.slice.Tensor %3066, %int3_2534, %int0_2535, %int9223372036854775807_2536, %int1_2537 : !torch.vtensor<[1,?,1,128],bf16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,?,1,128],bf16> | |
torch.bind_symbolic_shape %3067, [%773], affine_map<()[s0] -> (1, s0 * 32, 1, 128)> : !torch.vtensor<[1,?,1,128],bf16> | |
%int4_2538 = torch.constant.int 4 | |
%int1_2539 = torch.constant.int 1 | |
%int1_2540 = torch.constant.int 1 | |
%int1_2541 = torch.constant.int 1 | |
%3068 = torch.prim.ListConstruct %int4_2538, %int1_2539, %int1_2540, %int1_2541 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3069 = torch.aten.repeat %3067, %3068 : !torch.vtensor<[1,?,1,128],bf16>, !torch.list<int> -> !torch.vtensor<[4,?,1,128],bf16> | |
torch.bind_symbolic_shape %3069, [%773], affine_map<()[s0] -> (4, s0 * 32, 1, 128)> : !torch.vtensor<[4,?,1,128],bf16> | |
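// Apply the same rotary embedding to the 8 KV key heads (%2949): k*cos + rotate_half(k)*sin -> %3077.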
%3070 = torch.aten.mul.Tensor %2949, %3063 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3070, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int3_2542 = torch.constant.int 3 | |
%int0_2543 = torch.constant.int 0 | |
%int64_2544 = torch.constant.int 64 | |
%int1_2545 = torch.constant.int 1 | |
%3071 = torch.aten.slice.Tensor %2949, %int3_2542, %int0_2543, %int64_2544, %int1_2545 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3071, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%int3_2546 = torch.constant.int 3 | |
%int64_2547 = torch.constant.int 64 | |
%int9223372036854775807_2548 = torch.constant.int 9223372036854775807 | |
%int1_2549 = torch.constant.int 1 | |
%3072 = torch.aten.slice.Tensor %2949, %int3_2546, %int64_2547, %int9223372036854775807_2548, %int1_2549 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3072, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%3073 = torch.aten.neg %3072 : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> -> !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3073, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 64)> : !torch.vtensor<[4,?,8,64],f8E4M3FNUZ> | |
%3074 = torch.prim.ListConstruct %3073, %3071 : (!torch.vtensor<[4,?,8,64],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,64],f8E4M3FNUZ>) -> !torch.list<vtensor> | |
%int-1_2550 = torch.constant.int -1 | |
%3075 = torch.aten.cat %3074, %int-1_2550 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3075, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%3076 = torch.aten.mul.Tensor %3075, %3069 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,1,128],bf16> -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3076, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
%int1_2551 = torch.constant.int 1 | |
%3077 = torch.aten.add.Tensor %3070, %3076, %int1_2551 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3077, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 128)> : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ> | |
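// Compute flat row indices into the paged KV cache: page ids (%arg2) * 64 rows per page plus offset 12, which presumably
// selects this block's key slot among the 32 layers x {K, V} rows of each page; the roped keys are reshaped to
// [batch*pages, 32, 8, 128] rows to match.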
%int64_2552 = torch.constant.int 64 | |
%3078 = torch.aten.mul.Scalar %arg2, %int64_2552 : !torch.vtensor<[4,?],si64>, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %3078, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int12 = torch.constant.int 12 | |
%int1_2553 = torch.constant.int 1 | |
%3079 = torch.aten.add.Scalar %3078, %int12, %int1_2553 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %3079, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%int4_2554 = torch.constant.int 4 | |
%int32_2555 = torch.constant.int 32 | |
%int8_2556 = torch.constant.int 8 | |
%int128_2557 = torch.constant.int 128 | |
%3080 = torch.prim.ListConstruct %int4_2554, %775, %int32_2555, %int8_2556, %int128_2557 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3081 = torch.aten.view %3077, %3080 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3081, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2558 = torch.constant.int 32 | |
%int8_2559 = torch.constant.int 8 | |
%int128_2560 = torch.constant.int 128 | |
%3082 = torch.prim.ListConstruct %997, %int32_2558, %int8_2559, %int128_2560 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3083 = torch.aten.view %3081, %3082 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3083, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3084 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%3085 = torch.aten.view %3079, %3084 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %3085, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
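// Scatter the keys into the paged cache: bitcast the f8E4M3FNUZ values to si8 (the index_put is done on the integer view),
// view the flat cache %2784 ([?, 2097152]) as [pages*64, 32, 8, 128] rows, index_put at the computed row indices,
// bitcast back to f8E4M3FNUZ and restore the flat [?, 2097152] layout.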
%int26_2561 = torch.constant.int 26 | |
%3086 = torch.prims.convert_element_type %3083, %int26_2561 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3086, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2562 = torch.constant.int 1 | |
%3087 = torch.aten.view.dtype %3086, %int1_2562 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3087, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3088 = torch.aten.detach %3087 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3088, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3089 = torch.aten.detach %3088 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3089, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2563 = torch.constant.int 32 | |
%int2_2564 = torch.constant.int 2 | |
%int32_2565 = torch.constant.int 32 | |
%int8_2566 = torch.constant.int 8 | |
%int128_2567 = torch.constant.int 128 | |
%3090 = torch.prim.ListConstruct %776, %int32_2563, %int2_2564, %int32_2565, %int8_2566, %int128_2567 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3091 = torch.aten.view %2784, %3090 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3091, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2568 = torch.constant.int 32 | |
%int8_2569 = torch.constant.int 8 | |
%int128_2570 = torch.constant.int 128 | |
%3092 = torch.prim.ListConstruct %990, %int32_2568, %int8_2569, %int128_2570 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3093 = torch.aten.view %3091, %3092 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3093, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2571 = torch.constant.int 1 | |
%3094 = torch.aten.view.dtype %3093, %int1_2571 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3094, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3095 = torch.aten.detach %3094 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3095, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3096 = torch.aten.detach %3095 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3096, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3097 = torch.prim.ListConstruct %3085 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2572 = torch.constant.bool false | |
%3098 = torch.aten.index_put %3096, %3097, %3089, %false_2572 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3098, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2573 = torch.constant.int 26 | |
%3099 = torch.aten.view.dtype %3098, %int26_2573 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3099, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3100 = torch.aten.detach %3099 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3100, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3101 = torch.aten.detach %3100 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3101, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2574 = torch.constant.int 32 | |
%int2_2575 = torch.constant.int 2 | |
%int32_2576 = torch.constant.int 32 | |
%int8_2577 = torch.constant.int 8 | |
%int128_2578 = torch.constant.int 128 | |
%3102 = torch.prim.ListConstruct %776, %int32_2574, %int2_2575, %int32_2576, %int8_2577, %int128_2578 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3103 = torch.aten.view %3101, %3102 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3103, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2579 = torch.constant.int 2097152 | |
%3104 = torch.prim.ListConstruct %776, %int2097152_2579 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3105 = torch.aten.view %3103, %3104 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3105, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
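// Same scatter for the value states (%2951): row indices are the key indices + 1 (the adjacent value slot),
// written into the cache tensor produced by the key update (%3105).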
%int4_2580 = torch.constant.int 4 | |
%int32_2581 = torch.constant.int 32 | |
%int8_2582 = torch.constant.int 8 | |
%int128_2583 = torch.constant.int 128 | |
%3106 = torch.prim.ListConstruct %int4_2580, %775, %int32_2581, %int8_2582, %int128_2583 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3107 = torch.aten.view %2951, %3106 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3107, [%773], affine_map<()[s0] -> (4, s0, 32, 8, 128)> : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ> | |
%int32_2584 = torch.constant.int 32 | |
%int8_2585 = torch.constant.int 8 | |
%int128_2586 = torch.constant.int 128 | |
%3108 = torch.prim.ListConstruct %997, %int32_2584, %int8_2585, %int128_2586 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3109 = torch.aten.view %3107, %3108 : !torch.vtensor<[4,?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3109, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2587 = torch.constant.int 1 | |
%int1_2588 = torch.constant.int 1 | |
%3110 = torch.aten.add.Scalar %3079, %int1_2587, %int1_2588 : !torch.vtensor<[4,?],si64>, !torch.int, !torch.int -> !torch.vtensor<[4,?],si64> | |
torch.bind_symbolic_shape %3110, [%773], affine_map<()[s0] -> (4, s0)> : !torch.vtensor<[4,?],si64> | |
%3111 = torch.prim.ListConstruct %997 : (!torch.int) -> !torch.list<int> | |
%3112 = torch.aten.view %3110, %3111 : !torch.vtensor<[4,?],si64>, !torch.list<int> -> !torch.vtensor<[?],si64> | |
torch.bind_symbolic_shape %3112, [%773], affine_map<()[s0] -> (s0 * 4)> : !torch.vtensor<[?],si64> | |
%int26_2589 = torch.constant.int 26 | |
%3113 = torch.prims.convert_element_type %3109, %int26_2589 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3113, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2590 = torch.constant.int 1 | |
%3114 = torch.aten.view.dtype %3113, %int1_2590 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3114, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3115 = torch.aten.detach %3114 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3115, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3116 = torch.aten.detach %3115 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3116, [%773], affine_map<()[s0] -> (s0 * 4, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int32_2591 = torch.constant.int 32 | |
%int2_2592 = torch.constant.int 2 | |
%int32_2593 = torch.constant.int 32 | |
%int8_2594 = torch.constant.int 8 | |
%int128_2595 = torch.constant.int 128 | |
%3117 = torch.prim.ListConstruct %776, %int32_2591, %int2_2592, %int32_2593, %int8_2594, %int128_2595 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3118 = torch.aten.view %3105, %3117 : !torch.vtensor<[?,2097152],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3118, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int32_2596 = torch.constant.int 32 | |
%int8_2597 = torch.constant.int 8 | |
%int128_2598 = torch.constant.int 128 | |
%3119 = torch.prim.ListConstruct %990, %int32_2596, %int8_2597, %int128_2598 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3120 = torch.aten.view %3118, %3119 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3120, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int1_2599 = torch.constant.int 1 | |
%3121 = torch.aten.view.dtype %3120, %int1_2599 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3121, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3122 = torch.aten.detach %3121 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3122, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3123 = torch.aten.detach %3122 : !torch.vtensor<[?,32,8,128],si8> -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3123, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%3124 = torch.prim.ListConstruct %3112 : (!torch.vtensor<[?],si64>) -> !torch.list<optional<vtensor>> | |
%false_2600 = torch.constant.bool false | |
%3125 = torch.aten.index_put %3123, %3124, %3116, %false_2600 : !torch.vtensor<[?,32,8,128],si8>, !torch.list<optional<vtensor>>, !torch.vtensor<[?,32,8,128],si8>, !torch.bool -> !torch.vtensor<[?,32,8,128],si8> | |
torch.bind_symbolic_shape %3125, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],si8> | |
%int26_2601 = torch.constant.int 26 | |
%3126 = torch.aten.view.dtype %3125, %int26_2601 : !torch.vtensor<[?,32,8,128],si8>, !torch.int -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3126, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3127 = torch.aten.detach %3126 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3127, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%3128 = torch.aten.detach %3127 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> -> !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3128, [%774], affine_map<()[s0] -> (s0 * 64, 32, 8, 128)> : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ> | |
%int32_2602 = torch.constant.int 32 | |
%int2_2603 = torch.constant.int 2 | |
%int32_2604 = torch.constant.int 32 | |
%int8_2605 = torch.constant.int 8 | |
%int128_2606 = torch.constant.int 128 | |
%3129 = torch.prim.ListConstruct %776, %int32_2602, %int2_2603, %int32_2604, %int8_2605, %int128_2606 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3130 = torch.aten.view %3128, %3129 : !torch.vtensor<[?,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3130, [%774], affine_map<()[s0] -> (s0, 32, 2, 32, 8, 128)> : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ> | |
%int2097152_2607 = torch.constant.int 2097152 | |
%3131 = torch.prim.ListConstruct %776, %int2097152_2607 : (!torch.int, !torch.int) -> !torch.list<int> | |
%3132 = torch.aten.view %3130, %3131 : !torch.vtensor<[?,32,2,32,8,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3132, [%774], affine_map<()[s0] -> (s0, 2097152)> : !torch.vtensor<[?,2097152],f8E4M3FNUZ> | |
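// Grouped-query attention expansion: unsqueeze the 8 KV heads of the roped keys (%3077) and of the values (%2951)
// and expand each 4x so they match the 32 query heads, yielding [4, seq, 32, 128].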
%int-2_2608 = torch.constant.int -2 | |
%3133 = torch.aten.unsqueeze %3077, %int-2_2608 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3133, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2609 = torch.constant.int 4 | |
%int8_2610 = torch.constant.int 8 | |
%int4_2611 = torch.constant.int 4 | |
%int128_2612 = torch.constant.int 128 | |
%3134 = torch.prim.ListConstruct %int4_2609, %777, %int8_2610, %int4_2611, %int128_2612 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2613 = torch.constant.bool false | |
%3135 = torch.aten.expand %3133, %3134, %false_2613 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3135, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2614 = torch.constant.int 0 | |
%3136 = torch.aten.clone %3135, %int0_2614 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3136, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2615 = torch.constant.int 4 | |
%int32_2616 = torch.constant.int 32 | |
%int128_2617 = torch.constant.int 128 | |
%3137 = torch.prim.ListConstruct %int4_2615, %777, %int32_2616, %int128_2617 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3138 = torch.aten._unsafe_view %3136, %3137 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3138, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
%int-2_2618 = torch.constant.int -2 | |
%3139 = torch.aten.unsqueeze %2951, %int-2_2618 : !torch.vtensor<[4,?,8,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3139, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 1, 128)> : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ> | |
%int4_2619 = torch.constant.int 4 | |
%int8_2620 = torch.constant.int 8 | |
%int4_2621 = torch.constant.int 4 | |
%int128_2622 = torch.constant.int 128 | |
%3140 = torch.prim.ListConstruct %int4_2619, %777, %int8_2620, %int4_2621, %int128_2622 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2623 = torch.constant.bool false | |
%3141 = torch.aten.expand %3139, %3140, %false_2623 : !torch.vtensor<[4,?,8,1,128],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3141, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int0_2624 = torch.constant.int 0 | |
%3142 = torch.aten.clone %3141, %int0_2624 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3142, [%773], affine_map<()[s0] -> (4, s0 * 32, 8, 4, 128)> : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ> | |
%int4_2625 = torch.constant.int 4 | |
%int32_2626 = torch.constant.int 32 | |
%int128_2627 = torch.constant.int 128 | |
%3143 = torch.prim.ListConstruct %int4_2625, %777, %int32_2626, %int128_2627 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3144 = torch.aten._unsafe_view %3142, %3143 : !torch.vtensor<[4,?,8,4,128],f8E4M3FNUZ>, !torch.list<int> -> !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3144, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ> | |
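// Transpose Q (%3014), K (%3138) and V (%3144) to [4, 32, seq, 128] and re-assert the f8E4M3FNUZ element type ahead of the attention kernel.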
%int1_2628 = torch.constant.int 1 | |
%int2_2629 = torch.constant.int 2 | |
%3145 = torch.aten.transpose.int %3014, %int1_2628, %int2_2629 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3145, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2630 = torch.constant.int 1 | |
%int2_2631 = torch.constant.int 2 | |
%3146 = torch.aten.transpose.int %3138, %int1_2630, %int2_2631 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3146, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int1_2632 = torch.constant.int 1 | |
%int2_2633 = torch.constant.int 2 | |
%3147 = torch.aten.transpose.int %3144, %int1_2632, %int2_2633 : !torch.vtensor<[4,?,32,128],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3147, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2634 = torch.constant.int 26 | |
%3148 = torch.prims.convert_element_type %3145, %int26_2634 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3148, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2635 = torch.constant.int 26 | |
%3149 = torch.prims.convert_element_type %3146, %int26_2635 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3149, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
%int26_2636 = torch.constant.int 26 | |
%3150 = torch.prims.convert_element_type %3147, %int26_2636 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3150, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> | |
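// Cast the additive attention mask %803 ([4, 1, seq, seq], f32) to f8E4M3FNUZ and select/slice it down to a single 2-D [seq, seq] mask.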
%int26_2637 = torch.constant.int 26 | |
%3151 = torch.prims.convert_element_type %803, %int26_2637 : !torch.vtensor<[4,1,?,?],f32>, !torch.int -> !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3151, [%773], affine_map<()[s0] -> (4, 1, s0 * 32, s0 * 32)> : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ> | |
%int0_2638 = torch.constant.int 0 | |
%int0_2639 = torch.constant.int 0 | |
%3152 = torch.aten.select.int %3151, %int0_2638, %int0_2639 : !torch.vtensor<[4,1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3152, [%773], affine_map<()[s0] -> (1, s0 * 32, s0 * 32)> : !torch.vtensor<[1,?,?],f8E4M3FNUZ> | |
%int0_2640 = torch.constant.int 0 | |
%int0_2641 = torch.constant.int 0 | |
%3153 = torch.aten.select.int %3152, %int0_2640, %int0_2641 : !torch.vtensor<[1,?,?],f8E4M3FNUZ>, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3153, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int0_2642 = torch.constant.int 0 | |
%int0_2643 = torch.constant.int 0 | |
%int9223372036854775807_2644 = torch.constant.int 9223372036854775807 | |
%int1_2645 = torch.constant.int 1 | |
%3154 = torch.aten.slice.Tensor %3153, %int0_2642, %int0_2643, %int9223372036854775807_2644, %int1_2645 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3154, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
%int1_2646 = torch.constant.int 1 | |
%int0_2647 = torch.constant.int 0 | |
%int9223372036854775807_2648 = torch.constant.int 9223372036854775807 | |
%int1_2649 = torch.constant.int 1 | |
%3155 = torch.aten.slice.Tensor %3154, %int1_2646, %int0_2647, %int9223372036854775807_2648, %int1_2649 : !torch.vtensor<[?,?],f8E4M3FNUZ>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[?,?],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3155, [%773], affine_map<()[s0] -> (s0 * 32, s0 * 32)> : !torch.vtensor<[?,?],f8E4M3FNUZ> | |
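// Masked flash attention: %3159 is the scalar attention scale taken from global %155; the sharktank masked-flash-attention
// microkernel consumes Q, K, V, the scale and the 2-D mask and returns the f32 context tensor of shape [4, 32, seq, 128].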
%none_2650 = torch.constant.none | |
%3156 = torch.aten.clone %155, %none_2650 : !torch.vtensor<[],f32>, !torch.none -> !torch.vtensor<[],f32> | |
%3157 = torch.aten.detach %3156 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%3158 = torch.aten.detach %3157 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%3159 = torch.aten.detach %3158 : !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%3160 = torch_c.to_builtin_tensor %3148 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%3161 = torch_c.to_builtin_tensor %3149 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%3162 = torch_c.to_builtin_tensor %3150 : !torch.vtensor<[4,32,?,128],f8E4M3FNUZ> -> tensor<4x32x?x128xf8E4M3FNUZ> | |
%3163 = torch_c.to_builtin_tensor %3155 : !torch.vtensor<[?,?],f8E4M3FNUZ> -> tensor<?x?xf8E4M3FNUZ> | |
%3164 = torch_c.to_builtin_tensor %3159 : !torch.vtensor<[],f32> -> tensor<f32> | |
%3165 = util.call @sharktank_masked_flash_attention_4_32_128_128_f8E4M3FNUZ_f8E4M3FNUZ_f32_f32(%3160, %3161, %3162, %3164, %3163) : (tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<4x32x?x128xf8E4M3FNUZ>, tensor<f32>, tensor<?x?xf8E4M3FNUZ>) -> tensor<4x32x?x128xf32> | |
%3166 = torch_c.from_builtin_tensor %3165 : tensor<4x32x?x128xf32> -> !torch.vtensor<[4,32,?,128],f32> | |
torch.bind_symbolic_shape %3166, [%773], affine_map<()[s0] -> (4, 32, s0 * 32, 128)> : !torch.vtensor<[4,32,?,128],f32> | |
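// Attention output projection: transpose back to [4, seq, 32, 128], flatten the heads to 4096, quantize to f8E4M3FNUZ
// (divide by scale %156, clamp to the f8 range +/-240), run the batched transpose-B matmul against the expanded
// 4096x4096 attn_output weight, then dequantize with the product of the scale constants %156 and %158 and add onto the residual %2896.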
%int1_2651 = torch.constant.int 1 | |
%int2_2652 = torch.constant.int 2 | |
%3167 = torch.aten.transpose.int %3166, %int1_2651, %int2_2652 : !torch.vtensor<[4,32,?,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %3167, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int0_2653 = torch.constant.int 0 | |
%3168 = torch.aten.clone %3167, %int0_2653 : !torch.vtensor<[4,?,32,128],f32>, !torch.int -> !torch.vtensor<[4,?,32,128],f32> | |
torch.bind_symbolic_shape %3168, [%773], affine_map<()[s0] -> (4, s0 * 32, 32, 128)> : !torch.vtensor<[4,?,32,128],f32> | |
%int4_2654 = torch.constant.int 4 | |
%int4096_2655 = torch.constant.int 4096 | |
%3169 = torch.prim.ListConstruct %int4_2654, %777, %int4096_2655 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%3170 = torch.aten._unsafe_view %3168, %3169 : !torch.vtensor<[4,?,32,128],f32>, !torch.list<int> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3170, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3171 = torch.aten.div.Tensor %3170, %156 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3171, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2656 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2657 = torch.constant.float 2.400000e+02 | |
%3172 = torch.aten.clamp %3171, %float-2.400000e02_2656, %float2.400000e02_2657 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3172, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2658 = torch.constant.int 26 | |
%3173 = torch.prims.convert_element_type %3172, %int26_2658 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3173, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2659 = torch.constant.int 0 | |
%3174 = torch.aten.unsqueeze %157, %int0_2659 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2660 = torch.constant.int 4 | |
%int4096_2661 = torch.constant.int 4096 | |
%int4096_2662 = torch.constant.int 4096 | |
%3175 = torch.prim.ListConstruct %int4_2660, %int4096_2661, %int4096_2662 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2663 = torch.constant.bool false | |
%3176 = torch.aten.expand %3174, %3175, %false_2663 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%3177 = torch_c.to_builtin_tensor %3173 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3178 = torch_c.to_builtin_tensor %3176 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%3179 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%3177, %3178) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%3180 = torch_c.from_builtin_tensor %3179 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3180, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3181 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3182 = torch.aten.permute %158, %3181 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3183 = torch.aten.mul.Tensor %156, %3182 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2664 = torch.constant.int 6 | |
%3184 = torch.prims.convert_element_type %3180, %int6_2664 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3184, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3185 = torch.aten.mul.Tensor %3184, %3183 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3185, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int1_2665 = torch.constant.int 1 | |
%3186 = torch.aten.add.Tensor %2896, %3185, %int1_2665 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3186, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
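// FFN RMSNorm on the post-attention residual %3186: x * rsqrt(mean(x^2) + 1e-05), scaled by the ffn_norm weight %159.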
%int6_2666 = torch.constant.int 6 | |
%3187 = torch.prims.convert_element_type %3186, %int6_2666 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3187, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2667 = torch.constant.int 2 | |
%3188 = torch.aten.pow.Tensor_Scalar %3187, %int2_2667 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3188, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2668 = torch.constant.int -1 | |
%3189 = torch.prim.ListConstruct %int-1_2668 : (!torch.int) -> !torch.list<int> | |
%true_2669 = torch.constant.bool true | |
%none_2670 = torch.constant.none | |
%3190 = torch.aten.mean.dim %3188, %3189, %true_2669, %none_2670 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3190, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2671 = torch.constant.float 1.000000e-05 | |
%int1_2672 = torch.constant.int 1 | |
%3191 = torch.aten.add.Scalar %3190, %float1.000000e-05_2671, %int1_2672 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3191, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3192 = torch.aten.rsqrt %3191 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3192, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3193 = torch.aten.mul.Tensor %3187, %3192 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3193, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2673 = torch.constant.int 6 | |
%3194 = torch.prims.convert_element_type %3193, %int6_2673 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3194, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3195 = torch.aten.mul.Tensor %159, %3194 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3195, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2674 = torch.constant.int 6 | |
%3196 = torch.prims.convert_element_type %3195, %int6_2674 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3196, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
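// ffn_gate branch: quantize the normed activations (divide by %160, clamp to +/-240, cast to f8E4M3FNUZ),
// batched transpose-B matmul against the expanded 14336x4096 gate weight, dequantize with the %160 * %162 scale product, then SiLU.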
%3197 = torch.aten.div.Tensor %3196, %160 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3197, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2675 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2676 = torch.constant.float 2.400000e+02 | |
%3198 = torch.aten.clamp %3197, %float-2.400000e02_2675, %float2.400000e02_2676 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3198, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2677 = torch.constant.int 26 | |
%3199 = torch.prims.convert_element_type %3198, %int26_2677 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3199, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2678 = torch.constant.int 0 | |
%3200 = torch.aten.unsqueeze %161, %int0_2678 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2679 = torch.constant.int 4 | |
%int14336_2680 = torch.constant.int 14336 | |
%int4096_2681 = torch.constant.int 4096 | |
%3201 = torch.prim.ListConstruct %int4_2679, %int14336_2680, %int4096_2681 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2682 = torch.constant.bool false | |
%3202 = torch.aten.expand %3200, %3201, %false_2682 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%3203 = torch_c.to_builtin_tensor %3199 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3204 = torch_c.to_builtin_tensor %3202 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%3205 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%3203, %3204) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%3206 = torch_c.from_builtin_tensor %3205 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3206, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3207 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3208 = torch.aten.permute %162, %3207 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3209 = torch.aten.mul.Tensor %160, %3208 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2683 = torch.constant.int 6 | |
%3210 = torch.prims.convert_element_type %3206, %int6_2683 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3210, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3211 = torch.aten.mul.Tensor %3210, %3209 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3211, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3212 = torch.aten.silu %3211 : !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3212, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
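// ffn_up branch: the same quantize / matmul / dequantize pipeline against the up-projection weight %164,
// multiplied elementwise with the SiLU(gate) activations (%3212).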
%3213 = torch.aten.div.Tensor %3196, %163 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3213, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2684 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2685 = torch.constant.float 2.400000e+02 | |
%3214 = torch.aten.clamp %3213, %float-2.400000e02_2684, %float2.400000e02_2685 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3214, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2686 = torch.constant.int 26 | |
%3215 = torch.prims.convert_element_type %3214, %int26_2686 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3215, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2687 = torch.constant.int 0 | |
%3216 = torch.aten.unsqueeze %164, %int0_2687 : !torch.vtensor<[14336,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,14336,4096],f8E4M3FNUZ> | |
%int4_2688 = torch.constant.int 4 | |
%int14336_2689 = torch.constant.int 14336 | |
%int4096_2690 = torch.constant.int 4096 | |
%3217 = torch.prim.ListConstruct %int4_2688, %int14336_2689, %int4096_2690 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2691 = torch.constant.bool false | |
%3218 = torch.aten.expand %3216, %3217, %false_2691 : !torch.vtensor<[1,14336,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> | |
%3219 = torch_c.to_builtin_tensor %3215 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3220 = torch_c.to_builtin_tensor %3218 : !torch.vtensor<[4,14336,4096],f8E4M3FNUZ> -> tensor<4x14336x4096xf8E4M3FNUZ> | |
%3221 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x14336x4096xf8E4M3FNUZ_f32(%3219, %3220) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x14336x4096xf8E4M3FNUZ>) -> tensor<4x?x14336xf32> | |
%3222 = torch_c.from_builtin_tensor %3221 : tensor<4x?x14336xf32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3222, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3223 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3224 = torch.aten.permute %165, %3223 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3225 = torch.aten.mul.Tensor %163, %3224 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2692 = torch.constant.int 6 | |
%3226 = torch.prims.convert_element_type %3222, %int6_2692 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3226, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3227 = torch.aten.mul.Tensor %3226, %3225 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3227, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%3228 = torch.aten.mul.Tensor %3212, %3227 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[4,?,14336],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3228, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
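// ffn_down projection: quantize the gated product (divide by %166, clamp to +/-240, cast to f8E4M3FNUZ),
// batched matmul against the 4096x14336 down weight %167, dequantize, and add the result back onto the residual stream (%3186).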
%3229 = torch.aten.div.Tensor %3228, %166 : !torch.vtensor<[4,?,14336],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3229, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%float-2.400000e02_2693 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2694 = torch.constant.float 2.400000e+02 | |
%3230 = torch.aten.clamp %3229, %float-2.400000e02_2693, %float2.400000e02_2694 : !torch.vtensor<[4,?,14336],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,14336],f32> | |
torch.bind_symbolic_shape %3230, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f32> | |
%int26_2695 = torch.constant.int 26 | |
%3231 = torch.prims.convert_element_type %3230, %int26_2695 : !torch.vtensor<[4,?,14336],f32>, !torch.int -> !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3231, [%773], affine_map<()[s0] -> (4, s0 * 32, 14336)> : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> | |
%int0_2696 = torch.constant.int 0 | |
%3232 = torch.aten.unsqueeze %167, %int0_2696 : !torch.vtensor<[4096,14336],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,14336],f8E4M3FNUZ> | |
%int4_2697 = torch.constant.int 4 | |
%int4096_2698 = torch.constant.int 4096 | |
%int14336_2699 = torch.constant.int 14336 | |
%3233 = torch.prim.ListConstruct %int4_2697, %int4096_2698, %int14336_2699 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2700 = torch.constant.bool false | |
%3234 = torch.aten.expand %3232, %3233, %false_2700 : !torch.vtensor<[1,4096,14336],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> | |
%3235 = torch_c.to_builtin_tensor %3231 : !torch.vtensor<[4,?,14336],f8E4M3FNUZ> -> tensor<4x?x14336xf8E4M3FNUZ> | |
%3236 = torch_c.to_builtin_tensor %3234 : !torch.vtensor<[4,4096,14336],f8E4M3FNUZ> -> tensor<4x4096x14336xf8E4M3FNUZ> | |
%3237 = util.call @sharktank_batch_matmul_transpose_b_L4xDx14336xf8E4M3FNUZ_R4x4096x14336xf8E4M3FNUZ_f32(%3235, %3236) : (tensor<4x?x14336xf8E4M3FNUZ>, tensor<4x4096x14336xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%3238 = torch_c.from_builtin_tensor %3237 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3238, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3239 = torch.prim.ListConstruct : () -> !torch.list<int> | |
%3240 = torch.aten.permute %168, %3239 : !torch.vtensor<[],f32>, !torch.list<int> -> !torch.vtensor<[],f32> | |
%3241 = torch.aten.mul.Tensor %166, %3240 : !torch.vtensor<[],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[],f32> | |
%int6_2701 = torch.constant.int 6 | |
%3242 = torch.prims.convert_element_type %3238, %int6_2701 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3242, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3243 = torch.aten.mul.Tensor %3242, %3241 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3243, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
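// Dequantization of the down-projection output: the kernel result is multiplied by the combined
// scale (input rscale %166 times the weight dequant scale). The add below folds the FFN output
// back into the residual stream %3186.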
%int1_2702 = torch.constant.int 1 | |
%3244 = torch.aten.add.Tensor %3186, %3243, %int1_2702 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3244, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
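// RMSNorm over the residual for the next block: square the activations, mean over the last
// dimension, add eps (1e-05), rsqrt, multiply back, then scale by the bf16 norm weight %169.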
%int6_2703 = torch.constant.int 6 | |
%3245 = torch.prims.convert_element_type %3244, %int6_2703 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3245, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int2_2704 = torch.constant.int 2 | |
%3246 = torch.aten.pow.Tensor_Scalar %3245, %int2_2704 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3246, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int-1_2705 = torch.constant.int -1 | |
%3247 = torch.prim.ListConstruct %int-1_2705 : (!torch.int) -> !torch.list<int> | |
%true_2706 = torch.constant.bool true | |
%none_2707 = torch.constant.none | |
%3248 = torch.aten.mean.dim %3246, %3247, %true_2706, %none_2707 : !torch.vtensor<[4,?,4096],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3248, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%float1.000000e-05_2708 = torch.constant.float 1.000000e-05 | |
%int1_2709 = torch.constant.int 1 | |
%3249 = torch.aten.add.Scalar %3248, %float1.000000e-05_2708, %int1_2709 : !torch.vtensor<[4,?,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3249, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3250 = torch.aten.rsqrt %3249 : !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,1],f32> | |
torch.bind_symbolic_shape %3250, [%773], affine_map<()[s0] -> (4, s0 * 32, 1)> : !torch.vtensor<[4,?,1],f32> | |
%3251 = torch.aten.mul.Tensor %3245, %3250 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[4,?,1],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3251, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2710 = torch.constant.int 6 | |
%3252 = torch.prims.convert_element_type %3251, %int6_2710 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3252, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3253 = torch.aten.mul.Tensor %169, %3252 : !torch.vtensor<[4096],bf16>, !torch.vtensor<[4,?,4096],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3253, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int6_2711 = torch.constant.int 6 | |
%3254 = torch.prims.convert_element_type %3253, %int6_2711 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3254, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
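// Likely the attention Q projection of the next block: quantize the normalized activations to fp8
// against the q_input rscale %170 (divide, clamp to +/-240, cast), then batched transpose-B matmul
// with the [4096,4096] fp8 weight %171.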
%3255 = torch.aten.div.Tensor %3254, %170 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3255, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2712 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2713 = torch.constant.float 2.400000e+02 | |
%3256 = torch.aten.clamp %3255, %float-2.400000e02_2712, %float2.400000e02_2713 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3256, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2714 = torch.constant.int 26 | |
%3257 = torch.prims.convert_element_type %3256, %int26_2714 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3257, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2715 = torch.constant.int 0 | |
%3258 = torch.aten.unsqueeze %171, %int0_2715 : !torch.vtensor<[4096,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,4096,4096],f8E4M3FNUZ> | |
%int4_2716 = torch.constant.int 4 | |
%int4096_2717 = torch.constant.int 4096 | |
%int4096_2718 = torch.constant.int 4096 | |
%3259 = torch.prim.ListConstruct %int4_2716, %int4096_2717, %int4096_2718 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2719 = torch.constant.bool false | |
%3260 = torch.aten.expand %3258, %3259, %false_2719 : !torch.vtensor<[1,4096,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> | |
%3261 = torch_c.to_builtin_tensor %3257 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3262 = torch_c.to_builtin_tensor %3260 : !torch.vtensor<[4,4096,4096],f8E4M3FNUZ> -> tensor<4x4096x4096xf8E4M3FNUZ> | |
%3263 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x4096x4096xf8E4M3FNUZ_f32(%3261, %3262) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x4096x4096xf8E4M3FNUZ>) -> tensor<4x?x4096xf32> | |
%3264 = torch_c.from_builtin_tensor %3263 : tensor<4x?x4096xf32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3264, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%3265 = torch.aten.div.Tensor %3264, %172 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3265, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2720 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2721 = torch.constant.float 2.400000e+02 | |
%3266 = torch.aten.clamp %3265, %float-2.400000e02_2720, %float2.400000e02_2721 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3266, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2722 = torch.constant.int 26 | |
%3267 = torch.prims.convert_element_type %3266, %int26_2722 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3267, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
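// The Q result above is rescaled by what appears to be the q_output rscale %172 and re-quantized to
// fp8. Next, the same normalized activations %3254 are quantized against the k_input rscale %173
// for the K projection with the [1024,4096] weight %174.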
%3268 = torch.aten.div.Tensor %3254, %173 : !torch.vtensor<[4,?,4096],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3268, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%float-2.400000e02_2723 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2724 = torch.constant.float 2.400000e+02 | |
%3269 = torch.aten.clamp %3268, %float-2.400000e02_2723, %float2.400000e02_2724 : !torch.vtensor<[4,?,4096],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,4096],f32> | |
torch.bind_symbolic_shape %3269, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f32> | |
%int26_2725 = torch.constant.int 26 | |
%3270 = torch.prims.convert_element_type %3269, %int26_2725 : !torch.vtensor<[4,?,4096],f32>, !torch.int -> !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
torch.bind_symbolic_shape %3270, [%773], affine_map<()[s0] -> (4, s0 * 32, 4096)> : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> | |
%int0_2726 = torch.constant.int 0 | |
%3271 = torch.aten.unsqueeze %174, %int0_2726 : !torch.vtensor<[1024,4096],f8E4M3FNUZ>, !torch.int -> !torch.vtensor<[1,1024,4096],f8E4M3FNUZ> | |
%int4_2727 = torch.constant.int 4 | |
%int1024_2728 = torch.constant.int 1024 | |
%int4096_2729 = torch.constant.int 4096 | |
%3272 = torch.prim.ListConstruct %int4_2727, %int1024_2728, %int4096_2729 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int> | |
%false_2730 = torch.constant.bool false | |
%3273 = torch.aten.expand %3271, %3272, %false_2730 : !torch.vtensor<[1,1024,4096],f8E4M3FNUZ>, !torch.list<int>, !torch.bool -> !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> | |
%3274 = torch_c.to_builtin_tensor %3270 : !torch.vtensor<[4,?,4096],f8E4M3FNUZ> -> tensor<4x?x4096xf8E4M3FNUZ> | |
%3275 = torch_c.to_builtin_tensor %3273 : !torch.vtensor<[4,1024,4096],f8E4M3FNUZ> -> tensor<4x1024x4096xf8E4M3FNUZ> | |
%3276 = util.call @sharktank_batch_matmul_transpose_b_L4xDx4096xf8E4M3FNUZ_R4x1024x4096xf8E4M3FNUZ_f32(%3274, %3275) : (tensor<4x?x4096xf8E4M3FNUZ>, tensor<4x1024x4096xf8E4M3FNUZ>) -> tensor<4x?x1024xf32> | |
%3277 = torch_c.from_builtin_tensor %3276 : tensor<4x?x1024xf32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %3277, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
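// K projection output [4,?,1024]; below it is rescaled (presumably by the k_output rscale %175),
// clamped to +/-240, and cast to fp8, matching the quantized attention path used elsewhere in
// this module.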
%3278 = torch.aten.div.Tensor %3277, %175 : !torch.vtensor<[4,?,1024],f32>, !torch.vtensor<[],f32> -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %3278, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32> | |
%float-2.400000e02_2731 = torch.constant.float -2.400000e+02 | |
%float2.400000e02_2732 = torch.constant.float 2.400000e+02 | |
%3279 = torch.aten.clamp %3278, %float-2.400000e02_2731, %float2.400000e02_2732 : !torch.vtensor<[4,?,1024],f32>, !torch.float, !torch.float -> !torch.vtensor<[4,?,1024],f32> | |
torch.bind_symbolic_shape %3279, [%773], affine_map<()[s0] -> (4, s0 * 32, 1024)> : !torch.vtensor<[4,?,1024],f32>