A script which 'converts' an existing GPT-Neo model into the GPT-2 architecture with a modified target_positions value. The output GPT-2 model will then need further training, using the same tokenizer as the original model, to recover from the switch from local to global attention (a minimal fine-tuning sketch follows the script below).
import torch
from transformers import GPT2LMHeadModel, GPTNeoForCausalLM, GPT2Config


def convert_neo_to_gpt2(neo_model_path, output_path, target_positions=1024):
    # Load the trained GPT-Neo model
    neo_model = GPTNeoForCausalLM.from_pretrained(neo_model_path)
    # Create a GPT-2 config matching GPT-Neo's structure but with reduced position embeddings
    gpt2_config = GPT2Config(
        vocab_size=neo_model.config.vocab_size,
        n_positions=target_positions,  # e.g. 1024 instead of the original 2048
        n_embd=neo_model.config.hidden_size,
        n_layer=neo_model.config.num_layers,
        n_head=neo_model.config.num_heads,
        n_inner=neo_model.config.intermediate_size if getattr(neo_model.config, 'intermediate_size', None) is not None else 4 * neo_model.config.hidden_size,
        activation_function=getattr(neo_model.config, 'activation_function', 'gelu_new'),  # keep the activation the Neo model was trained with
        resid_pdrop=getattr(neo_model.config, 'resid_dropout', 0.1),
        embd_pdrop=getattr(neo_model.config, 'embed_dropout', 0.1),
        attn_pdrop=getattr(neo_model.config, 'attention_dropout', 0.1),
        layer_norm_epsilon=getattr(neo_model.config, 'layer_norm_epsilon', 1e-5),
    )
    # Instantiate a new GPT-2 model with the modified configuration
    gpt2_model = GPT2LMHeadModel(gpt2_config)

    # Copy embeddings: token embeddings in full, position embeddings truncated to target_positions
    gpt2_model.transformer.wte.weight.data = neo_model.transformer.wte.weight.data.clone()
    gpt2_model.transformer.wpe.weight.data = neo_model.transformer.wpe.weight.data[:target_positions, :].clone()
    # Copy layer weights and biases
    for i in range(min(len(gpt2_model.transformer.h), len(neo_model.transformer.h))):
        # Copy layer norms
        gpt2_model.transformer.h[i].ln_1.weight.data = neo_model.transformer.h[i].ln_1.weight.data.clone()
        gpt2_model.transformer.h[i].ln_1.bias.data = neo_model.transformer.h[i].ln_1.bias.data.clone()
        gpt2_model.transformer.h[i].ln_2.weight.data = neo_model.transformer.h[i].ln_2.weight.data.clone()
        gpt2_model.transformer.h[i].ln_2.bias.data = neo_model.transformer.h[i].ln_2.bias.data.clone()

        # Attention weights
        hidden_size = neo_model.config.hidden_size
        q_weight = neo_model.transformer.h[i].attn.attention.q_proj.weight.data
        k_weight = neo_model.transformer.h[i].attn.attention.k_proj.weight.data
        v_weight = neo_model.transformer.h[i].attn.attention.v_proj.weight.data

        # Stack the weights side by side and transpose
        qkv_weights = torch.cat([q_weight, k_weight, v_weight], dim=0).T
        gpt2_model.transformer.h[i].attn.c_attn.weight.data = qkv_weights.clone()

        # Handle biases
        q_bias = neo_model.transformer.h[i].attn.attention.q_proj.bias
        k_bias = neo_model.transformer.h[i].attn.attention.k_proj.bias
        v_bias = neo_model.transformer.h[i].attn.attention.v_proj.bias

        # Ensure biases aren't None
        q_bias = q_bias if q_bias is not None else torch.zeros(hidden_size)
        k_bias = k_bias if k_bias is not None else torch.zeros(hidden_size)
        v_bias = v_bias if v_bias is not None else torch.zeros(hidden_size)

        qkv_biases = torch.cat([q_bias, k_bias, v_bias])
        gpt2_model.transformer.h[i].attn.c_attn.bias.data = qkv_biases.clone()

        # Attention output projection (needs transpose)
        gpt2_model.transformer.h[i].attn.c_proj.weight.data = \
            neo_model.transformer.h[i].attn.attention.out_proj.weight.data.T.clone()
        gpt2_model.transformer.h[i].attn.c_proj.bias.data = \
            neo_model.transformer.h[i].attn.attention.out_proj.bias.data.clone()

        # MLP layers (need transpose)
        # For c_fc: GPT2 expects [n_embd, 4*n_embd], Neo has [4*n_embd, n_embd]
        gpt2_model.transformer.h[i].mlp.c_fc.weight.data = \
            neo_model.transformer.h[i].mlp.c_fc.weight.data.T.clone()
        gpt2_model.transformer.h[i].mlp.c_fc.bias.data = \
            neo_model.transformer.h[i].mlp.c_fc.bias.data.clone()

        # For c_proj: GPT2 expects [4*n_embd, n_embd], Neo has [n_embd, 4*n_embd]
        gpt2_model.transformer.h[i].mlp.c_proj.weight.data = \
            neo_model.transformer.h[i].mlp.c_proj.weight.data.T.clone()
        gpt2_model.transformer.h[i].mlp.c_proj.bias.data = \
            neo_model.transformer.h[i].mlp.c_proj.bias.data.clone()

    # Copy final layer norm
    gpt2_model.transformer.ln_f.weight.data = neo_model.transformer.ln_f.weight.data.clone()
    gpt2_model.transformer.ln_f.bias.data = neo_model.transformer.ln_f.bias.data.clone()

    # Copy LM head
    gpt2_model.lm_head.weight.data = neo_model.lm_head.weight.data.clone()

    # Save the modified GPT-2 model
    gpt2_model.save_pretrained(output_path)
    print(f"Model successfully converted and saved to {output_path}")


# Example usage
convert_neo_to_gpt2("your-gpt_neo-input-model", "your-gpt2-output-model", target_positions=1024)
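As noted above, the converted model should not be used as-is: GPT-Neo alternates local and global attention layers while GPT-2 attends globally in every layer, so the copied weights need further training with the original tokenizer. The snippet below is a minimal sketch of that step, not part of the conversion script. The model/tokenizer paths reuse the example-usage placeholders above, and the in-memory texts, learning rate, step count, and output path "your-gpt2-finetuned-model" are illustrative assumptions; in practice you would train on your real corpus (e.g. with the Trainer API and a DataLoader).

# Minimal fine-tuning sketch (assumptions: placeholder paths, placeholder corpus)
import torch
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("your-gpt2-output-model")
# Reuse the ORIGINAL model's tokenizer, assuming its files live alongside the GPT-Neo checkpoint
tokenizer = AutoTokenizer.from_pretrained("your-gpt_neo-input-model")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

texts = ["placeholder training document 1", "placeholder training document 2"]
batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=1024)

# Mask padded positions out of the language-modeling loss
labels = batch["input_ids"].clone()
labels[batch["attention_mask"] == 0] = -100

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model.train()
for step in range(10):  # a handful of steps just to illustrate the loop
    outputs = model(input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    labels=labels)
    outputs.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(f"step {step}: loss {outputs.loss.item():.4f}")

model.save_pretrained("your-gpt2-finetuned-model")
tokenizer.save_pretrained("your-gpt2-finetuned-model")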