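Below is a debug dump of the module tree (`named_modules()`) and parameter shapes (`named_parameters()`) of a `LlamaForCausalLM` as built by vLLM with the TPU Pallas attention backend (`PallasAttentionBackendImpl`). The shapes (32 decoder layers, hidden size 4096, 32 query heads / 8 KV heads of size 128, MLP intermediate size 14336, vocabulary 128256, RoPE base 500000.0, max position 131072) are consistent with a Llama-3.1-8B-class checkpoint. A minimal sketch of how such a dump can be produced is shown here; the helper name `dump_modules_and_params` is hypothetical, how `model` is obtained from the vLLM engine is version-dependent and not shown, and `xw32` is just the author's debug tag.

```python
import torch

def dump_modules_and_params(model: torch.nn.Module) -> None:
    # Hypothetical helper: `model` is assumed to be the torch.nn.Module that
    # vLLM constructed (the LlamaForCausalLM instance shown below).
    print("xw32 printing named modules")
    for name, module in model.named_modules():
        # The first entry has an empty name and prints the full nested repr.
        print(name, module)
    print("xw32 printing named parameters")
    for name, param in model.named_parameters():
        print(name, param.shape)
```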
xw32 printing named modules
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): VocabParallelEmbedding(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
          (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
          (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
          (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
        )
        (mlp): LlamaMLP(
          (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
          (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
          (act_fn): SiluAndMul()
        )
        (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
        (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
      )
    )
    (norm): RMSNorm(hidden_size=4096, eps=1e-05)
  )
  (lm_head): ParallelLMHead(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
  (logits_processor): LogitsProcessor(vocab_size=128256, forg_vocab_size=128256, scale=1.0, logits_as_input=False)
)
model LlamaModel(
  (embed_tokens): VocabParallelEmbedding(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
        (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
        (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
        (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
      )
      (mlp): LlamaMLP(
        (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
        (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
        (act_fn): SiluAndMul()
      )
      (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
      (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
    )
  )
  (norm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.embed_tokens VocabParallelEmbedding(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
model.layers ModuleList(
  (0-31): 32 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
      (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
      (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
      (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
    )
    (mlp): LlamaMLP(
      (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
      (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
      (act_fn): SiluAndMul()
    )
    (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
    (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  )
)
model.layers.0 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.0.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.0.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.0.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.0.self_attn.rotary_emb Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
model.layers.0.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.0.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.0.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.0.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.0.mlp.act_fn SiluAndMul()
model.layers.0.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.0.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.1 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.1.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.1.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.1.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.1.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.1.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.1.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.1.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.1.mlp.act_fn SiluAndMul()
model.layers.1.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.1.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.2 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.2.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.2.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.2.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.2.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.2.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.2.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.2.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.2.mlp.act_fn SiluAndMul()
model.layers.2.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.2.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.3 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.3.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.3.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.3.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.3.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.3.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.3.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.3.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.3.mlp.act_fn SiluAndMul()
model.layers.3.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.3.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
...
model.layers.31 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.31.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.31.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.31.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.31.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.31.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.31.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.31.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.31.mlp.act_fn SiluAndMul()
model.layers.31.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.31.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.norm RMSNorm(hidden_size=4096, eps=1e-05)
lm_head ParallelLMHead(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
logits_processor LogitsProcessor(vocab_size=128256, forg_vocab_size=128256, scale=1.0, logits_as_input=False)
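One quirk in the listing above: `rotary_emb` appears under `model.layers.0.self_attn` but not under the later layers (e.g. `model.layers.1.self_attn` jumps from `o_proj` straight to `attn`). That is most likely because vLLM caches rotary-embedding modules in `get_rope()`, so all 32 layers share a single `Llama3RotaryEmbedding` instance, and `named_modules()` skips already-seen instances by default. A small sketch to verify this, reusing `model` from the sketch at the top (this explanation is an inference, not stated in the dump itself):

```python
# List rotary embeddings without deduplication; if the id() is identical for
# every layer, the 32 decoder layers share one cached rotary-embedding module.
for name, module in model.named_modules(remove_duplicate=False):
    if name.endswith("rotary_emb"):
        print(name, hex(id(module)))
```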
xw32 printing named parameters
model.embed_tokens.weight torch.Size([128256, 4096])
model.layers.0.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.0.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.0.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.0.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.0.input_layernorm.weight torch.Size([4096])
model.layers.0.post_attention_layernorm.weight torch.Size([4096])
model.layers.1.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.1.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.1.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.1.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.1.input_layernorm.weight torch.Size([4096])
model.layers.1.post_attention_layernorm.weight torch.Size([4096])
model.layers.2.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.2.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.2.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.2.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.2.input_layernorm.weight torch.Size([4096])
model.layers.2.post_attention_layernorm.weight torch.Size([4096])
model.layers.3.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.3.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.3.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.3.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.3.input_layernorm.weight torch.Size([4096])
model.layers.3.post_attention_layernorm.weight torch.Size([4096])
...
model.layers.30.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.30.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.30.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.30.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.30.input_layernorm.weight torch.Size([4096])
model.layers.30.post_attention_layernorm.weight torch.Size([4096])
model.layers.31.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.31.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.31.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.31.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.31.input_layernorm.weight torch.Size([4096])
model.layers.31.post_attention_layernorm.weight torch.Size([4096])
model.norm.weight torch.Size([4096])
lm_head.weight torch.Size([128256, 4096])
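As a sanity check, the shapes listed above add up to the expected ~8B parameters (a sketch, again reusing `model` from the top):

```python
# Per layer, from the printed shapes:
#   qkv_proj   6144*4096  = 25,165,824   (Q: 32*128 fused with KV: 2*8*128)
#   o_proj     4096*4096  = 16,777,216
#   gate_up    28672*4096 = 117,440,512  (gate and up fused: 2*14336)
#   down_proj  4096*14336 = 58,720,256
#   2 norms    2*4096     = 8,192
# 32 layers -> 6,979,584,000; plus embed_tokens and lm_head (128256*4096 each)
# and the final norm -> 8,030,261,248 total.
total = sum(p.numel() for p in model.parameters())
print(f"total parameters: {total:,}")  # expected: 8,030,261,248
```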