xw32 printing named modules
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): VocabParallelEmbedding(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
          (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
          (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
          (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
        )
        (mlp): LlamaMLP(
          (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
          (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
          (act_fn): SiluAndMul()
        )
        (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
        (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
      )
    )
    (norm): RMSNorm(hidden_size=4096, eps=1e-05)
  )
  (lm_head): ParallelLMHead(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
  (logits_processor): LogitsProcessor(vocab_size=128256, forg_vocab_size=128256, scale=1.0, logits_as_input=False)
)
model LlamaModel(
  (embed_tokens): VocabParallelEmbedding(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
        (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
        (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
        (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
      )
      (mlp): LlamaMLP(
        (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
        (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
        (act_fn): SiluAndMul()
      )
      (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
      (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
    )
  )
  (norm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.embed_tokens VocabParallelEmbedding(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
model.layers ModuleList(
  (0-31): 32 x LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
      (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
      (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
      (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
    )
    (mlp): LlamaMLP(
      (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
      (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
      (act_fn): SiluAndMul()
    )
    (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
    (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  )
)
model.layers.0 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.0.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.0.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.0.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.0.self_attn.rotary_emb Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
model.layers.0.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.0.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.0.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.0.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.0.mlp.act_fn SiluAndMul()
model.layers.0.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.0.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.1 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.1.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.1.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.1.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.1.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.1.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.1.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.1.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.1.mlp.act_fn SiluAndMul()
model.layers.1.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.1.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.2 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.2.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.2.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.2.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.2.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.2.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.2.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.2.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.2.mlp.act_fn SiluAndMul()
model.layers.2.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.2.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.3 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.3.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.3.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.3.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.3.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.3.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.3.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.3.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.3.mlp.act_fn SiluAndMul()
model.layers.3.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.3.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
...
model.layers.31 LlamaDecoderLayer(
  (self_attn): LlamaAttention(
    (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): LlamaMLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
  (post_attention_layernorm): RMSNorm(hidden_size=4096, eps=1e-05)
)
model.layers.31.self_attn LlamaAttention(
  (qkv_proj): QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): Llama3RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=131072, base=500000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
)
model.layers.31.self_attn.qkv_proj QKVParallelLinear(in_features=4096, output_features=6144, bias=False, tp_size=1, gather_output=False)
model.layers.31.self_attn.o_proj RowParallelLinear(input_features=4096, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.31.self_attn.attn Attention(head_size=128, num_heads=32, num_kv_heads=8, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
model.layers.31.mlp LlamaMLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
)
model.layers.31.mlp.gate_up_proj MergedColumnParallelLinear(in_features=4096, output_features=28672, bias=False, tp_size=1, gather_output=False)
model.layers.31.mlp.down_proj RowParallelLinear(input_features=14336, output_features=4096, bias=False, tp_size=1, reduce_results=True)
model.layers.31.mlp.act_fn SiluAndMul()
model.layers.31.input_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.layers.31.post_attention_layernorm RMSNorm(hidden_size=4096, eps=1e-05)
model.norm RMSNorm(hidden_size=4096, eps=1e-05)
lm_head ParallelLMHead(num_embeddings=128256, embedding_dim=4096, org_vocab_size=128256, num_embeddings_padded=128256, tp_size=1)
logits_processor LogitsProcessor(vocab_size=128256, forg_vocab_size=128256, scale=1.0, logits_as_input=False)
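
The listing above matches what you would get by iterating the model's named_modules(): the first entry is the root module (printed under an empty name), followed by each submodule prefixed with its dotted path. A minimal sketch, assuming `model` is the already-constructed vLLM LlamaForCausalLM shown above (the model-loading step is omitted):

import torch.nn as nn

def print_named_modules(model: nn.Module) -> None:
    print("xw32 printing named modules")
    for name, module in model.named_modules():
        # named_modules() yields the root module first with an empty name,
        # which is why the first entry above is the bare LlamaForCausalLM repr.
        print(name, module)

By default named_modules() skips modules it has already visited (remove_duplicate=True), which is consistent with the rotary embedding being a single shared instance: model.layers.0.self_attn.rotary_emb is the only rotary_emb entry in the flat listing even though every layer's repr shows one.
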
xw32 printing named parameters
model.embed_tokens.weight torch.Size([128256, 4096])
model.layers.0.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.0.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.0.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.0.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.0.input_layernorm.weight torch.Size([4096])
model.layers.0.post_attention_layernorm.weight torch.Size([4096])
model.layers.1.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.1.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.1.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.1.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.1.input_layernorm.weight torch.Size([4096])
model.layers.1.post_attention_layernorm.weight torch.Size([4096])
model.layers.2.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.2.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.2.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.2.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.2.input_layernorm.weight torch.Size([4096])
model.layers.2.post_attention_layernorm.weight torch.Size([4096])
model.layers.3.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.3.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.3.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.3.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.3.input_layernorm.weight torch.Size([4096])
model.layers.3.post_attention_layernorm.weight torch.Size([4096])
...
model.layers.30.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.30.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.30.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.30.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.30.input_layernorm.weight torch.Size([4096])
model.layers.30.post_attention_layernorm.weight torch.Size([4096])
model.layers.31.self_attn.qkv_proj.weight torch.Size([6144, 4096])
model.layers.31.self_attn.o_proj.weight torch.Size([4096, 4096])
model.layers.31.mlp.gate_up_proj.weight torch.Size([28672, 4096])
model.layers.31.mlp.down_proj.weight torch.Size([4096, 14336])
model.layers.31.input_layernorm.weight torch.Size([4096])
model.layers.31.post_attention_layernorm.weight torch.Size([4096])
model.norm.weight torch.Size([4096])
lm_head.weight torch.Size([128256, 4096])
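
The parameter listing reads like the output of named_parameters() with each tensor's shape. A minimal sketch under the same assumption that `model` is the module printed above:

import torch.nn as nn

def print_named_parameters(model: nn.Module) -> None:
    print("xw32 printing named parameters")
    for name, param in model.named_parameters():
        # Printing the shape gives the torch.Size([...]) entries seen above.
        print(name, param.shape)

The shapes are consistent with the Llama 3 8B configuration and with the fused projections in the module dump: qkv_proj packs the query, key and value projections into one weight, and gate_up_proj packs the gate and up projections. A quick sanity check (the config names below follow the Hugging Face convention and are assumptions, not taken from the dump itself):

hidden_size, num_heads, num_kv_heads, head_dim, intermediate_size = 4096, 32, 8, 128, 14336

assert (num_heads + 2 * num_kv_heads) * head_dim == 6144     # qkv_proj.weight is [6144, 4096]
assert 2 * intermediate_size == 28672                        # gate_up_proj.weight is [28672, 4096]
assert abs(head_dim ** -0.5 - 0.08838834764831845) < 1e-12   # Attention scale = 1/sqrt(head_dim)
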