ugly hardcoded hack for point-alpaca conversion
# Convert a LLaMA model checkpoint to a ggml compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Data (float[n_elements])
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
# (A short reader sketch after this script shows how these records are parsed back.)
#
import argparse
import os
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor

def parse_args():
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model', help='directory containing the model checkpoint')
    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
    return parser.parse_args()

def get_n_parts(dim):
    # embedding dim -> number of checkpoint parts (7B: 1, 13B: 2, 30B: 4, 65B: 8)
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
        print(f"Invalid dim: {dim}")
        sys.exit(1)
    print(f"n_parts = {n_parts}\n")
    return n_parts

def load_hparams_and_tokenizer(dir_model):
    # `dir_model` is something like `models/7B` or `models/7B/`.
    # "tokenizer.model" is expected under the model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
    # Let's use the model's parent dir directly.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
    fname_hparams = f"{dir_model}/params.json"
    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
    with open(fname_hparams, "r") as f:
        hparams = json.load(f)
        print(hparams)
    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": 32000})  # force the base LLaMA vocab size
    return hparams, tokenizer

def write_header(fout, hparams, ftype):
    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
        0x67676d66,  # magic: ggmf in hex
        1,  # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
        ftype
    ]
    fout.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    for i in range(32000):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode("utf-8")
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)  # byte tokens look like "<0xAB>"
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def process_and_write_variables(fout, model, ftype):
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue

        shape = datao.shape

        # ugly hack: drop any extra token rows (e.g. an added pad token) so the
        # embedding / output tensors match the hardcoded 32000-token vocab
        if shape[0] > 32000:
            datao = datao[:32000]

        print(f"Processing variable: {name} with shape: {datao.shape} and type: {datao.dtype}")

        data = datao.numpy().squeeze()
        n_dims = len(shape)

        # default type is fp16
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

        # header
        sname = name.encode('utf-8')
        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # data output to file
        data.tofile(fout)

def main():
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{fname_model}'\n")
        model = torch.load(fname_model, map_location="cpu")
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
        del model
        print(f"Done. Output file: {fname_out}\n")
        return

    n_parts = get_n_parts(hparams["dim"])

    for p in range(n_parts):
        print(f"Processing part {p}\n")
        fname_model = f"{dir_model}/consolidated.0{p}.pth"
        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
        model = torch.load(fname_model, map_location="cpu")
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
            process_and_write_variables(fout, model, ftype)
        del model
        print(f"Done. Output file: {fname_out}, (part {p})\n")

if __name__ == "__main__":
    main()
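Below is a minimal reader sketch, not part of the gist, that parses back the header, vocabulary and tensor records emitted by write_header(), write_tokens() and process_and_write_variables() above; the function name read_ggml_file is my own placeholder.

# Not part of the gist: a minimal sketch of reading the format written above back in.
# Works for both the full model file and the vocab-only ggml-vocab.bin
# (which simply has no tensor records).
import struct
import numpy as np

def read_ggml_file(path):
    with open(path, "rb") as f:
        # header: 9 int32 values, in the order written by write_header()
        magic, version, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = \
            struct.unpack("9i", f.read(9 * 4))
        assert magic == 0x67676d66  # "ggmf"

        # vocabulary: per token, a length-prefixed byte string and a float score
        tokens = []
        for _ in range(vocab_size):
            (length,) = struct.unpack("i", f.read(4))
            text = f.read(length)
            (score,) = struct.unpack("f", f.read(4))
            tokens.append((text, score))

        # tensor records: dims were written in reversed order by the converter
        tensors = {}
        while True:
            rec = f.read(3 * 4)
            if len(rec) < 12:
                break  # end of file
            n_dims, name_len, ftype_cur = struct.unpack("3i", rec)
            dims = struct.unpack(f"{n_dims}i", f.read(n_dims * 4))
            name = f.read(name_len).decode("utf-8")
            dtype = np.float32 if ftype_cur == 0 else np.float16
            n_bytes = int(np.prod(dims)) * np.dtype(dtype).itemsize
            tensors[name] = np.frombuffer(f.read(n_bytes), dtype=dtype).reshape(dims[::-1])

    return (vocab_size, dim, multiple_of, n_heads, n_layers, ftype), tokens, tensors

The second file in the gist, below, produces the input for this converter: it exports the point-alpaca model from HuggingFace format (./result) back into the consolidated.00.pth + params.json layout under ./palpaca/7B.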
from transformers import LlamaTokenizer, LlamaForCausalLM
import os
import json
import torch
import transformers

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"

tokenizer = LlamaTokenizer.from_pretrained("./result")

base_model = LlamaForCausalLM.from_pretrained(
    "./result",
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# merge weights
for layer in base_model.model.layers:
    layer.self_attn.q_proj.merge_weights = True
    layer.self_attn.v_proj.merge_weights = True

base_model.train(False)

base_model_sd = base_model.state_dict()

# 7B hyperparameters, written out as params.json for the converter
params = {
    "dim": 4096,
    "multiple_of": 256,
    "n_heads": 32,
    "n_layers": 32,
    "norm_eps": 1e-06,
    "vocab_size": 32000,
}
n_layers = params["n_layers"]
n_heads = params["n_heads"]
dim = params["dim"]
dims_per_head = dim // n_heads
base = 10000.0
# rotary inverse frequencies (computed here but not used further below)
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))

# HF's LLaMA conversion stores the q/k projection weights in a permuted
# per-head layout for its rotary embeddings; permute() applies that layout
# and unpermute() undoes it.
def permute(w):
    return (
        w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
    )

def unpermute(w):
    return (
        w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)
    )

def translate_state_dict_key(k):
    if k == "model.embed_tokens.weight":
        return "tok_embeddings.weight"
    elif k == "model.norm.weight":
        return "norm.weight"
    elif k == "lm_head.weight":
        return "output.weight"
    elif k.startswith("model.layers."):
        layer = k.split(".")[2]
        if k.endswith(".self_attn.q_proj.weight"):
            return f"layers.{layer}.attention.wq.weight"
        elif k.endswith(".self_attn.k_proj.weight"):
            return f"layers.{layer}.attention.wk.weight"
        elif k.endswith(".self_attn.v_proj.weight"):
            return f"layers.{layer}.attention.wv.weight"
        elif k.endswith(".self_attn.o_proj.weight"):
            return f"layers.{layer}.attention.wo.weight"
        elif k.endswith(".mlp.gate_proj.weight"):
            return f"layers.{layer}.feed_forward.w1.weight"
        elif k.endswith(".mlp.down_proj.weight"):
            return f"layers.{layer}.feed_forward.w2.weight"
        elif k.endswith(".mlp.up_proj.weight"):
            return f"layers.{layer}.feed_forward.w3.weight"
        elif k.endswith(".input_layernorm.weight"):
            return f"layers.{layer}.attention_norm.weight"
        elif k.endswith(".post_attention_layernorm.weight"):
            return f"layers.{layer}.ffn_norm.weight"
        elif k.endswith("rotary_emb.inv_freq") or "lora" in k:
            return None
        else:
            print(layer, k)
            raise NotImplementedError
    else:
        print(k)
        raise NotImplementedError

# translate HF key names to the original LLaMA ones and undo the q/k permutation
new_state_dict = {}
for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is not None:
        if "wq" in new_k or "wk" in new_k:
            new_state_dict[new_k] = unpermute(v)
        else:
            new_state_dict[new_k] = v

os.makedirs("./palpaca/7B", exist_ok=True)
torch.save(new_state_dict, "./palpaca/7B/consolidated.00.pth")
with open("./palpaca/7B/params.json", "w") as f:
    json.dump(params, f)
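The wq/wk tensors go through unpermute() because HuggingFace's LLaMA conversion stores them in a permuted per-head layout for its rotary embeddings. A tiny self-contained check, not part of the gist and using toy sizes, confirms that unpermute() exactly inverts permute():

# Sanity-check sketch (not part of the gist): unpermute() inverts permute(),
# so converting HF-layout q/k weights back recovers the original LLaMA ordering.
import torch

n_heads, dim = 4, 16  # toy sizes; the 7B model uses n_heads=32, dim=4096

def permute(w):
    return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)

def unpermute(w):
    return w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim)

w = torch.randn(dim, dim)
assert torch.equal(unpermute(permute(w)), w)  # lossless round trip
print("unpermute() inverts permute()")

With consolidated.00.pth and params.json written to ./palpaca/7B (and tokenizer.model placed in ./palpaca), the converter script above can then be pointed at ./palpaca/7B to produce the ggml file.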