Skip to content

Instantly share code, notes, and snippets.

@CoderCowMoo
Created May 5, 2024 14:17
Show Gist options
  • Save CoderCowMoo/d71a19e2995349411fde5942d761b98e to your computer and use it in GitHub Desktop.
from safetensors import safe_open
import einops
import torch
from transformers import AutoModelForCausalLM
def get_orthogonalized_matrix(matrix: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
    """Remove the component of each row of `matrix` that lies along `vec`.

    Args:
        matrix: Tensor of shape (..., d_model) — e.g. an embedding or
            projection weight viewed so its last dim is the model dim.
        vec: Direction of shape (d_model,). NOTE(review): the projection
            removed is (matrix @ vec) * vec, which is a true orthogonal
            projection only when ``vec`` is unit-norm — confirm upstream
            that the refusal direction is normalized.

    Returns:
        Tensor of shape (..., d_model): ``matrix`` with its rank-1
        component along ``vec`` subtracted.
    """
    # Work on whatever device the weight already lives on.
    vec = vec.to(matrix.device)
    # (matrix @ vec) has shape (...,); broadcasting against vec rebuilds
    # the rank-1 outer-product component along the refusal direction.
    proj = (matrix @ vec).unsqueeze(-1) * vec
    return matrix - proj
# Load the instruct model; .eval() disables dropout etc. Weight edits below
# mutate the parameter .data in place.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-70B-Instruct"
).eval()

# Load the precomputed refusal direction (a single d_model vector).
with safe_open("refusal_dir.safetensors", framework="pt", device="cpu") as f:
    refusal_dir = f.get_tensor("refusal_dir")
refusal_dir = refusal_dir.cpu().float()

# Project the refusal direction out of every matrix that writes into the
# residual stream: the token embeddings, each attention output projection,
# and each MLP down projection.
model.model.embed_tokens.weight.data = get_orthogonalized_matrix(
    model.model.embed_tokens.weight, refusal_dir
)
for block in model.model.layers:
    block.self_attn.o_proj.weight.data = get_orthogonalized_matrix(
        block.self_attn.o_proj.weight, refusal_dir
    )
    # down_proj's output dim is its first weight axis, so orthogonalize
    # the transpose and transpose back.
    block.mlp.down_proj.weight.data = get_orthogonalized_matrix(
        block.mlp.down_proj.weight.T, refusal_dir
    ).T

model.save_pretrained("../llama-3-70b-orthogonalized")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment