@vanbasten23
Created August 5, 2025 22:51
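This gist is the output of enumerating the (name, module) pairs of a Qwen2ForCausalLM model as loaded by vLLM (tp_size=1, Pallas attention backend, i.e. TPU). Below is a minimal sketch of the kind of loop that produces output in this format; the function name and the way the model handle is obtained are assumptions for illustration, since the gist contains only the output:

import torch.nn as nn

def dump_module_tree(model: nn.Module) -> None:
    # named_modules() walks the module tree depth-first and yields
    # (qualified_name, module) pairs; the root module has the empty name ''.
    # Duplicate module objects are reported only once (remove_duplicate=True
    # is the default), which matters for shared submodules.
    for idx, named in enumerate(model.named_modules()):
        print(f"{idx} -> {named}")

# Hypothetical usage, assuming `model` is the underlying torch.nn.Module:
# dump_module_tree(model)
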
0 -> ('', Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): VocabParallelEmbedding(num_embeddings=151936, embedding_dim=2048, org_vocab_size=151936, num_embeddings_padded=151936, tp_size=1)
    (layers): ModuleList(
      (0-35): 36 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (qkv_proj): QKVParallelLinear(in_features=2048, output_features=2560, bias=True, tp_size=1, gather_output=False)
          (o_proj): RowParallelLinear(input_features=2048, output_features=2048, bias=False, tp_size=1, reduce_results=True)
          (rotary_emb): RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=32768, base=1000000.0, is_neox_style=True)
          (attn): Attention(head_size=128, num_heads=16, num_kv_heads=2, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
        )
        (mlp): Qwen2MLP(
          (gate_up_proj): MergedColumnParallelLinear(in_features=2048, output_features=22016, bias=False, tp_size=1, gather_output=False)
          (down_proj): RowParallelLinear(input_features=11008, output_features=2048, bias=False, tp_size=1, reduce_results=True)
          (act_fn): SiluAndMul()
        )
        (input_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
        (post_attention_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
      )
    )
    (norm): RMSNorm(hidden_size=2048, eps=1e-06)
  )
  (lm_head): VocabParallelEmbedding(num_embeddings=151936, embedding_dim=2048, org_vocab_size=151936, num_embeddings_padded=151936, tp_size=1)
  (logits_processor): LogitsProcessor(vocab_size=151936, org_vocab_size=151936, scale=1.0, logits_as_input=False)
))
1 -> ('model', Qwen2Model(
  (embed_tokens): VocabParallelEmbedding(num_embeddings=151936, embedding_dim=2048, org_vocab_size=151936, num_embeddings_padded=151936, tp_size=1)
  (layers): ModuleList(
    (0-35): 36 x Qwen2DecoderLayer(
      (self_attn): Qwen2Attention(
        (qkv_proj): QKVParallelLinear(in_features=2048, output_features=2560, bias=True, tp_size=1, gather_output=False)
        (o_proj): RowParallelLinear(input_features=2048, output_features=2048, bias=False, tp_size=1, reduce_results=True)
        (rotary_emb): RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=32768, base=1000000.0, is_neox_style=True)
        (attn): Attention(head_size=128, num_heads=16, num_kv_heads=2, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
      )
      (mlp): Qwen2MLP(
        (gate_up_proj): MergedColumnParallelLinear(in_features=2048, output_features=22016, bias=False, tp_size=1, gather_output=False)
        (down_proj): RowParallelLinear(input_features=11008, output_features=2048, bias=False, tp_size=1, reduce_results=True)
        (act_fn): SiluAndMul()
      )
      (input_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
      (post_attention_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
    )
  )
  (norm): RMSNorm(hidden_size=2048, eps=1e-06)
))
2 -> ('model.embed_tokens', VocabParallelEmbedding(num_embeddings=151936, embedding_dim=2048, org_vocab_size=151936, num_embeddings_padded=151936, tp_size=1))
3 -> ('model.layers', ModuleList(
  (0-35): 36 x Qwen2DecoderLayer(
    (self_attn): Qwen2Attention(
      (qkv_proj): QKVParallelLinear(in_features=2048, output_features=2560, bias=True, tp_size=1, gather_output=False)
      (o_proj): RowParallelLinear(input_features=2048, output_features=2048, bias=False, tp_size=1, reduce_results=True)
      (rotary_emb): RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=32768, base=1000000.0, is_neox_style=True)
      (attn): Attention(head_size=128, num_heads=16, num_kv_heads=2, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
    )
    (mlp): Qwen2MLP(
      (gate_up_proj): MergedColumnParallelLinear(in_features=2048, output_features=22016, bias=False, tp_size=1, gather_output=False)
      (down_proj): RowParallelLinear(input_features=11008, output_features=2048, bias=False, tp_size=1, reduce_results=True)
      (act_fn): SiluAndMul()
    )
    (input_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
    (post_attention_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
  )
))
4 -> ('model.layers.0', Qwen2DecoderLayer(
  (self_attn): Qwen2Attention(
    (qkv_proj): QKVParallelLinear(in_features=2048, output_features=2560, bias=True, tp_size=1, gather_output=False)
    (o_proj): RowParallelLinear(input_features=2048, output_features=2048, bias=False, tp_size=1, reduce_results=True)
    (rotary_emb): RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=32768, base=1000000.0, is_neox_style=True)
    (attn): Attention(head_size=128, num_heads=16, num_kv_heads=2, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
  )
  (mlp): Qwen2MLP(
    (gate_up_proj): MergedColumnParallelLinear(in_features=2048, output_features=22016, bias=False, tp_size=1, gather_output=False)
    (down_proj): RowParallelLinear(input_features=11008, output_features=2048, bias=False, tp_size=1, reduce_results=True)
    (act_fn): SiluAndMul()
  )
  (input_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
  (post_attention_layernorm): RMSNorm(hidden_size=2048, eps=1e-06)
))
5 -> ('model.layers.0.self_attn', Qwen2Attention(
  (qkv_proj): QKVParallelLinear(in_features=2048, output_features=2560, bias=True, tp_size=1, gather_output=False)
  (o_proj): RowParallelLinear(input_features=2048, output_features=2048, bias=False, tp_size=1, reduce_results=True)
  (rotary_emb): RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=32768, base=1000000.0, is_neox_style=True)
  (attn): Attention(head_size=128, num_heads=16, num_kv_heads=2, scale=0.08838834764831845, backend=PallasAttentionBackendImpl)
))
6 -> ('model.layers.0.self_attn.qkv_proj', QKVParallelLinear(in_features=2048, output_features=2560, bias=True, tp_size=1, gather_output=False))
7 -> ('model.layers.0.self_attn.o_proj', RowParallelLinear(input_features=2048, output_features=2048, bias=False, tp_size=1, reduce_results=True))
8 -> ('model.layers.0.self_attn.rotary_emb', RotaryEmbedding(head_size=128, rotary_dim=128, max_position_embeddings=32768, base=1000000.0, is_neox_style=True))
9 -> ('model.layers.0.self_attn.attn', Attention(head_size=128, num_heads=16, num_kv_heads=2, scale=0.08838834764831845, backend=PallasAttentionBackendImpl))
10 -> ('model.layers.0.mlp', Qwen2MLP(
  (gate_up_proj): MergedColumnParallelLinear(in_features=2048, output_features=22016, bias=False, tp_size=1, gather_output=False)
  (down_proj): RowParallelLinear(input_features=11008, output_features=2048, bias=False, tp_size=1, reduce_results=True)
  (act_fn): SiluAndMul()
))
11 -> ('model.layers.0.mlp.gate_up_proj', MergedColumnParallelLinear(in_features=2048, output_features=22016, bias=False, tp_size=1, gather_output=False))
12 -> ('model.layers.0.mlp.down_proj', RowParallelLinear(input_features=11008, output_features=2048, bias=False, tp_size=1, reduce_results=True))
13 -> ('model.layers.0.mlp.act_fn', SiluAndMul())
14 -> ('model.layers.0.input_layernorm', RMSNorm(hidden_size=2048, eps=1e-06))
15 -> ('model.layers.0.post_attention_layernorm', RMSNorm(hidden_size=2048, eps=1e-06))
16 -> ('model.layers.1', Qwen2DecoderLayer(
....
400 -> ('model.layers.35.post_attention_layernorm', RMSNorm(hidden_size=2048, eps=1e-06))
401 -> ('model.norm', RMSNorm(hidden_size=2048, eps=1e-06))
402 -> ('logits_processor', LogitsProcessor(vocab_size=151936, org_vocab_size=151936, scale=1.0, logits_as_input=False))
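
Note on the entry count: indices run only from 0 to 402, fewer than a fully expanded tree of 36 decoder layers would give. This is consistent with named_modules() skipping duplicate module objects by default: the RotaryEmbedding appears to be shared across all 36 layers and is therefore listed only once (entry 8, under model.layers.0.self_attn), and lm_head is tied to model.embed_tokens, so there is no separate lm_head entry between model.norm (401) and logits_processor (402).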