@Norod
Last active November 3, 2024 13:06
A script which 'converts' an existing GPT-Neo model into the GPT-2 architecture with a modified target_positions value. The output GPT-2 model will then need further training with the same tokenizer as the original model in order to recover from the switch from local to global attention (a minimal fine-tuning sketch follows the script below).
import torch
from transformers import GPT2LMHeadModel, GPTNeoForCausalLM, GPT2Config
def convert_neo_to_gpt2(neo_model_path, output_path, target_positions=1024):
    # Load the trained GPT-Neo model
    neo_model = GPTNeoForCausalLM.from_pretrained(neo_model_path)

    # Create a GPT-2 config matching GPT-Neo's structure but with reduced position embeddings
    gpt2_config = GPT2Config(
        vocab_size=neo_model.config.vocab_size,
        n_positions=target_positions,  # e.g. 1024 instead of the original 2048
        n_embd=neo_model.config.hidden_size,
        n_layer=neo_model.config.num_layers,
        n_head=neo_model.config.num_heads,
        n_inner=neo_model.config.intermediate_size if getattr(neo_model.config, 'intermediate_size', None) is not None else 4 * neo_model.config.hidden_size,
        activation_function='gelu',
        resid_pdrop=neo_model.config.resid_dropout if hasattr(neo_model.config, 'resid_dropout') else 0.1,
        embd_pdrop=neo_model.config.embed_dropout if hasattr(neo_model.config, 'embed_dropout') else 0.1,
        attn_pdrop=neo_model.config.attention_dropout if hasattr(neo_model.config, 'attention_dropout') else 0.1,
        layer_norm_epsilon=neo_model.config.layer_norm_epsilon if hasattr(neo_model.config, 'layer_norm_epsilon') else 1e-5,
    )

    # Instantiate a new GPT-2 model with the modified configuration
    gpt2_model = GPT2LMHeadModel(gpt2_config)
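    # Note: GPT-Neo alternates global and local (windowed) attention layers, while GPT-2
    # only has global attention, so the copied weights are just an initialization and the
    # converted model needs further training to adapt (see the description above).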
    # Copy token embeddings in full; truncate position embeddings to the new context length
    gpt2_model.transformer.wte.weight.data = neo_model.transformer.wte.weight.data.clone()
    gpt2_model.transformer.wpe.weight.data = neo_model.transformer.wpe.weight.data[:target_positions, :].clone()
    # Copy layer weights and biases
    for i in range(min(len(gpt2_model.transformer.h), len(neo_model.transformer.h))):
        # Copy layer norms
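        # (ln_1 / ln_2 have the same names and shapes in both architectures, so they copy over directly)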
        gpt2_model.transformer.h[i].ln_1.weight.data = neo_model.transformer.h[i].ln_1.weight.data.clone()
        gpt2_model.transformer.h[i].ln_1.bias.data = neo_model.transformer.h[i].ln_1.bias.data.clone()
        gpt2_model.transformer.h[i].ln_2.weight.data = neo_model.transformer.h[i].ln_2.weight.data.clone()
        gpt2_model.transformer.h[i].ln_2.bias.data = neo_model.transformer.h[i].ln_2.bias.data.clone()
        # Attention weights
        hidden_size = neo_model.config.hidden_size
        q_weight = neo_model.transformer.h[i].attn.attention.q_proj.weight.data
        k_weight = neo_model.transformer.h[i].attn.attention.k_proj.weight.data
        v_weight = neo_model.transformer.h[i].attn.attention.v_proj.weight.data
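        # GPT-2 stores its projections as Conv1D modules with weight shape [in_features, out_features],
        # whereas GPT-Neo uses nn.Linear with weight shape [out_features, in_features],
        # hence the transposes when copying weights below.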
        # Stack the weights side by side and transpose
        qkv_weights = torch.cat([q_weight, k_weight, v_weight], dim=0).T
        gpt2_model.transformer.h[i].attn.c_attn.weight.data = qkv_weights.clone()
        # Handle biases
        q_bias = neo_model.transformer.h[i].attn.attention.q_proj.bias
        k_bias = neo_model.transformer.h[i].attn.attention.k_proj.bias
        v_bias = neo_model.transformer.h[i].attn.attention.v_proj.bias
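        # (In the Hugging Face GPT-Neo implementation the q/k/v projections are defined without
        # biases, so these are normally None and the zero fallbacks below are used)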
        # Ensure biases aren't None
        q_bias = q_bias if q_bias is not None else torch.zeros(hidden_size)
        k_bias = k_bias if k_bias is not None else torch.zeros(hidden_size)
        v_bias = v_bias if v_bias is not None else torch.zeros(hidden_size)
        qkv_biases = torch.cat([q_bias, k_bias, v_bias])
        gpt2_model.transformer.h[i].attn.c_attn.bias.data = qkv_biases.clone()
        # Attention output projection (needs transpose)
        gpt2_model.transformer.h[i].attn.c_proj.weight.data = \
            neo_model.transformer.h[i].attn.attention.out_proj.weight.data.T.clone()
        gpt2_model.transformer.h[i].attn.c_proj.bias.data = \
            neo_model.transformer.h[i].attn.attention.out_proj.bias.data.clone()
        # MLP layers (need transpose)
        # For c_fc: GPT2 expects [n_embd, 4*n_embd], Neo has [4*n_embd, n_embd]
        gpt2_model.transformer.h[i].mlp.c_fc.weight.data = \
            neo_model.transformer.h[i].mlp.c_fc.weight.data.T.clone()
        gpt2_model.transformer.h[i].mlp.c_fc.bias.data = \
            neo_model.transformer.h[i].mlp.c_fc.bias.data.clone()
        # For c_proj: GPT2 expects [4*n_embd, n_embd], Neo has [n_embd, 4*n_embd]
        gpt2_model.transformer.h[i].mlp.c_proj.weight.data = \
            neo_model.transformer.h[i].mlp.c_proj.weight.data.T.clone()
        gpt2_model.transformer.h[i].mlp.c_proj.bias.data = \
            neo_model.transformer.h[i].mlp.c_proj.bias.data.clone()
    # Copy final layer norm
    gpt2_model.transformer.ln_f.weight.data = neo_model.transformer.ln_f.weight.data.clone()
    gpt2_model.transformer.ln_f.bias.data = neo_model.transformer.ln_f.bias.data.clone()

    # Copy LM head
    gpt2_model.lm_head.weight.data = neo_model.lm_head.weight.data.clone()
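    # (With the default weight tying, lm_head.weight is the same tensor as wte.weight in both
    # models, so this is effectively the same copy as the token embeddings above)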
    # Save the modified GPT-2 model
    gpt2_model.save_pretrained(output_path)
    print(f"Model successfully converted and saved to {output_path}")
# Example usage
convert_neo_to_gpt2("your-gpt_neo-input-model", "your-gpt2-output-model", target_positions=1024)
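
Since the converted checkpoint is only an initialization, here is a minimal, hedged sketch of the follow-up training step mentioned in the description. It assumes the original tokenizer is saved alongside the GPT-Neo model, and uses placeholder names (train.txt for a plain-text training file, gpt2-recovery-finetune for the output directory) and arbitrary hyperparameters; adjust as needed.

from transformers import (AutoTokenizer, GPT2LMHeadModel, Trainer, TrainingArguments,
                          DataCollatorForLanguageModeling)
from datasets import load_dataset

# Load the converted GPT-2 model and the *original* GPT-Neo tokenizer
model = GPT2LMHeadModel.from_pretrained("your-gpt2-output-model")
tokenizer = AutoTokenizer.from_pretrained("your-gpt_neo-input-model")
tokenizer.pad_token = tokenizer.eos_token  # GPT-style tokenizers have no pad token by default

# Tokenize a plain-text dataset, truncating to the new 1024-token context
dataset = load_dataset("text", data_files={"train": "train.txt"})["train"]

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, max_length=1024)

dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

# Standard causal-LM fine-tuning to let the model adapt to global-only attention
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
args = TrainingArguments(
    output_dir="gpt2-recovery-finetune",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=5e-5,
)
Trainer(model=model, args=args, train_dataset=dataset, data_collator=collator).train()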