ahoho · April 2, 2023 19:35 · SurajTheCoder0 · May 29, 2025
diff --git a/convert-hf-to-pth.py b/convert-hf-to-pth.py
 # Convert a huggingface LLaMA checkpoint to an (unsharded) pytorch checkpoint
 # comes from https://github.com/tloen/alpaca-lora/blob/main/export_state_dict_checkpoint.py

 import argparse
 import json
 from pathlib import Path

 import torch
 import transformers
 from transformers import LlamaForCausalLM, LlamaTokenizer  # noqa: E402

 # Command line: the checkpoint location handed to ``from_pretrained`` and the
 # LLaMA size it corresponds to.
 parser = argparse.ArgumentParser()
 parser.add_argument("base_model")
 # Reject an unknown size here instead of failing with a KeyError on the
 # params_by_model lookup, which only runs after the whole model was loaded.
 # ``str.lower`` keeps "7B"-style spellings valid.
 parser.add_argument(
    "size_key",
    type=str.lower,
    choices=["7b", "13b", "30b", "65b"],
 )
 args = parser.parse_args()

 # Kept from the original script; the tokenizer object is not referenced again
 # in the visible code.
 tokenizer = LlamaTokenizer.from_pretrained(args.base_model)

 # Load the weights as float16 on the CPU.
 base_model = LlamaForCausalLM.from_pretrained(
    args.base_model,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
 )

 # Turn training mode off before exporting the weights.
 base_model.train(False)

 base_model_sd = base_model.state_dict()

 # Hyper-parameters written to params.json, keyed by lower-case model size.
 # Only dim / n_heads / n_layers differ between sizes; the key order of each
 # entry is kept because it is the order json.dump will emit.
 params_by_model = {
    size: {
        "dim": width,
        "multiple_of": 256,
        "n_heads": heads,
        "n_layers": depth,
        "norm_eps": 1e-06,
        "vocab_size": -1,
    }
    for size, (width, heads, depth) in {
        "7b": (4096, 32, 32),
        "13b": (5120, 40, 40),
        "30b": (6656, 52, 60),
        "65b": (8192, 64, 80),
    }.items()
 }
 params = params_by_model[args.size_key.lower()]
 n_layers, n_heads, dim = params["n_layers"], params["n_heads"], params["dim"]
 dims_per_head = dim // n_heads
 base = 10000.0
 # One inverse frequency per pair of per-head dimensions.
 inv_freq = 1.0 / base ** (
    torch.arange(0, dims_per_head, 2).float() / dims_per_head
 )


 def permute(w):
    """Regroup the rows of ``w`` head by head and return a (dim, dim) tensor.

    ``w`` is viewed as (n_heads, dim // n_heads // 2, 2, dim), axes 1 and 2
    are swapped, and the result is flattened back to (dim, dim).  ``n_heads``
    and ``dim`` are the module-level values.  ``unpermute`` applies the
    opposite axis swap.
    """
    half_head = dim // n_heads // 2
    per_head = w.view(n_heads, half_head, 2, dim)
    return per_head.transpose(1, 2).reshape(dim, dim)


 def unpermute(w):
    """Undo the row regrouping done by ``permute`` and return a (dim, dim) tensor.

    ``w`` is viewed as (n_heads, 2, dim // n_heads // 2, dim), axes 1 and 2
    are swapped, and the result is flattened back to (dim, dim).  ``n_heads``
    and ``dim`` are the module-level values.
    """
    half_head = dim // n_heads // 2
    per_head = w.view(n_heads, 2, half_head, dim)
    return per_head.transpose(1, 2).reshape(dim, dim)


 def translate_state_dict_key(k):  # noqa: C901
    """Map a Hugging Face state-dict key to the original LLaMA checkpoint key.

    Any ``"base_model.model."`` substring is removed first.

    Returns:
        The translated key, or ``None`` for per-layer keys that end with
        ``rotary_emb.inv_freq`` or contain ``"lora"`` (these are dropped by
        the caller).

    Raises:
        NotImplementedError: for any other key; the key is printed first.
    """
    k = k.replace("base_model.model.", "")

    top_level = {
        "model.embed_tokens.weight": "tok_embeddings.weight",
        "model.norm.weight": "norm.weight",
        "lm_head.weight": "output.weight",
    }
    if k in top_level:
        return top_level[k]

    if not k.startswith("model.layers."):
        print(k)
        raise NotImplementedError

    # "model.layers.<n>.…" -> the layer index is the third dotted component.
    layer = k.split(".")[2]
    per_layer = (
        (".self_attn.q_proj.weight", "attention.wq.weight"),
        (".self_attn.k_proj.weight", "attention.wk.weight"),
        (".self_attn.v_proj.weight", "attention.wv.weight"),
        (".self_attn.o_proj.weight", "attention.wo.weight"),
        (".mlp.gate_proj.weight", "feed_forward.w1.weight"),
        (".mlp.down_proj.weight", "feed_forward.w2.weight"),
        (".mlp.up_proj.weight", "feed_forward.w3.weight"),
        (".input_layernorm.weight", "attention_norm.weight"),
        (".post_attention_layernorm.weight", "ffn_norm.weight"),
    )
    for suffix, target in per_layer:
        if k.endswith(suffix):
            return f"layers.{layer}.{target}"

    if k.endswith("rotary_emb.inv_freq") or "lora" in k:
        return None

    print(layer, k)
    raise NotImplementedError


 # Re-key every tensor to the original LLaMA naming.  Keys translated to None
 # are dropped; the wq / wk projections additionally go through unpermute().
 new_state_dict = {}
 for k, v in base_model_sd.items():
    new_k = translate_state_dict_key(k)
    if new_k is not None:
        needs_unpermute = "wq" in new_k or "wk" in new_k
        new_state_dict[new_k] = unpermute(v) if needs_unpermute else v

 out_path = Path(args.base_model, "consolidated")
 # parents=True: args.base_model is not guaranteed to exist as a local
 # directory, and failing here would throw away the conversion work.
 out_path.mkdir(parents=True, exist_ok=True)
 torch.save(new_state_dict, out_path / "consolidated.00.pth")

 # Explicit encoding so the file does not depend on the platform default.
 with open(out_path / "params.json", "w", encoding="utf-8") as f:
    json.dump(params, f)
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
 # minor modification of the original file from llama.cpp
 # to account for the unsharded checkpoint;
 # call with `convert-pth-to-ggml.py <output dir of convert-hf-to-pth.py> 1 1`

 import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 import torch

 from sentencepiece import SentencePieceProcessor

 # Block size used by the two quantized tensor types (see GGML_BLCK_SIZE).
 QK = 32

 # Numeric ids of the ggml tensor types, 0 through 6.
 (
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
 ) = range(7)

 # File-type code (as used for ``ftype``) -> ggml tensor type id.
 WTYPES = dict(
    enumerate((GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1))
 )

 # Elements per block: QK for the quantized types, 1 for everything else.
 GGML_BLCK_SIZE = {
    **dict.fromkeys((GGML_TYPE_Q4_0, GGML_TYPE_Q4_1), QK),
    **dict.fromkeys(
        (GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_F32),
        1,
    ),
 }

 # Size of one block of each type; ggml_nbytes multiplies by this and divides
 # by the block size, so the unit is presumably bytes.
 GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK // 2,
    GGML_TYPE_Q4_1: 4 * 2 + QK // 2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
 }

 def ggml_nelements(shape):
    """Return the product of all extents in ``shape`` (1 for an empty shape)."""
    total = 1
    for extent in shape:
        total = total * extent
    return total

 def ggml_nbytes(shape, ftype):
    """Return the storage size of a tensor of ``shape`` for file type ``ftype``.

    ``ftype`` is translated to a ggml type through WTYPES; the element count
    is multiplied by that type's GGML_TYPE_SIZE and floor-divided by its
    GGML_BLCK_SIZE.
    """
    n_elements = ggml_nelements(shape)
    tensor_type = WTYPES[ftype]
    return n_elements * GGML_TYPE_SIZE[tensor_type] // GGML_BLCK_SIZE[tensor_type]

 def parse_args():
    """Parse the command line and return the argparse namespace.

    Positionals, in order: ``dir_model``, ``ftype`` (0 or 1), the optional
    ``vocab_only`` flag (defaults to 0) and ``n_parts``.  Only ``vocab_only``
    is declared with ``nargs='?'``.
    """
    cli = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    positionals = (
        ('dir_model', dict(help='directory containing the model checkpoint')),
        ('ftype', dict(help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)),
        ('vocab_only', dict(help='only write vocab to file', type=int, default=0, nargs='?')),
        ('n_parts', dict(type=int, default=None)),
    )
    for arg_name, options in positionals:
        cli.add_argument(arg_name, **options)
    return cli.parse_args()

 def get_n_parts(dim):
    """Return the number of checkpoint parts for a model of width ``dim``.

    Prints the chosen value.  For a ``dim`` that is not in the table, prints
    an error and exits the process with status 1.
    """
    parts_by_dim = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    if dim not in parts_by_dim:
        print(f"Invalid dim: {dim}")
        sys.exit(1)

    n_parts = parts_by_dim[dim]
    print(f"n_parts = {n_parts}\n")
    return n_parts

 def load_hparams_and_tokenizer(dir_model):
    """Load ``params.json`` and the SentencePiece tokenizer for a model.

    ``params.json`` is read from ``dir_model`` and printed; ``tokenizer.model``
    is read from the parent directory of ``dir_model``.  The ``vocab_size``
    entry of the returned hyper-parameters is overwritten with the
    tokenizer's vocabulary size.

    Returns:
        A ``(hparams, tokenizer)`` tuple.
    """
    # `dir_model` is something like `models/7B` or `models/7B/`.  Resolve the
    # parent textually instead of via "<dir_model>/..", which would not find
    # the tokenizer when `dir_model` is a symlink.
    parent_dir = os.path.dirname(os.path.normpath(dir_model))
    params_path = f"{dir_model}/params.json"
    tokenizer_path = f"{parent_dir}/tokenizer.model"
    with open(params_path, "r") as params_file:
        hparams = json.load(params_file)
        print(hparams)
    tokenizer = SentencePieceProcessor(tokenizer_path)
    hparams["vocab_size"] = tokenizer.vocab_size()
    return hparams, tokenizer

 def write_header(fout, hparams, ftype):
    """Write the file header to ``fout`` as nine native ``int`` values.

    Order: magic, file version, vocab_size, dim, multiple_of, n_heads,
    n_layers, dim // n_heads, ftype.
    """
    header = [
        0x67676a74,  # magic: "ggjt" in hex
        1,  # file version
    ]
    header += [
        hparams[field]
        for field in ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
    ]
    header.append(hparams["dim"] // hparams["n_heads"])  # rot (obsolete)
    header.append(ftype)
    fout.write(struct.pack(f"{len(header)}i", *header))

 def write_tokens(fout, tokenizer):
    """Write every vocabulary entry of ``tokenizer`` to ``fout``.

    Each entry is written as: native ``int`` byte length, the token bytes,
    native ``float`` score.  Exits the process with status 1 when a byte
    token's piece is not exactly six characters long.
    """

    def token_bytes(token_id):
        # Unknown token: U+2047 surrounded by spaces.
        if tokenizer.is_unknown(token_id):
            return " \u2047 ".encode()
        # Control tokens carry no text.
        if tokenizer.is_control(token_id):
            return b""
        # Byte tokens: the hex digits sit between the first three characters
        # and the last one (e.g. a piece such as "<0x41>").
        if tokenizer.is_byte(token_id):
            piece = tokenizer.id_to_piece(token_id)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            return struct.pack("B", int(piece[3:-1], 16))
        # Regular pieces: U+2581 is replaced by a plain space.
        return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()

    for token_id in range(tokenizer.vocab_size()):
        payload = token_bytes(token_id)
        fout.write(struct.pack("i", len(payload)))
        fout.write(payload)
        fout.write(struct.pack("f", tokenizer.get_score(token_id)))

 def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    """Write the tensors of one checkpoint part into the ggml output file.

    For every entry of ``model`` (entries whose name ends with "freqs" are
    skipped) a tensor header is written, the file is zero-padded to a
    multiple of QK bytes, and the tensor data is written at the position that
    belongs to ``part_id`` inside the full tensor.  Afterwards the file
    position is moved past the full tensor.

    Args:
        fout: seekable binary output file (``write``, ``seek`` and ``tell``
            are used).
        model: mapping of tensor name to an object providing ``.numpy()`` and
            ``.dtype``.
        ftype: requested file type; 0 converts every tensor to float32.
        part_id: zero-based index of the checkpoint part being written.
        n_parts: total number of checkpoint parts.

    NOTE(review): in the ``ftype == 1`` path 2-D tensors are written without
    any dtype conversion, so they are presumably already float16 — confirm.
    """
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue

        # remove dimensions with a single element
        data = datao.numpy().squeeze()
        partshape = data.shape
        n_dims = len(data.shape)
        assert n_dims in (1, 2)

        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")

        # coerce single-dimensional tensors from float16 to float32
        # (ftype_cur is the per-tensor file type: 1 unless converted to float32)
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        # split_dim is only assigned for 2-D tensors; every later use is
        # guarded by n_dims > 1 or by the n_dims == 1 branch below.
        if n_dims > 1:
            split_dim = 1
            if "tok_embeddings" in name:
                split_dim = 1
            elif "layers" in name:
                if "attention.wo.weight" in name:
                    split_dim = 1
                elif "feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif "output" in name:
                split_dim = 0

        # output tensor header
        # (the full tensor is n_parts times the part along the split dimension;
        # the extents are written in reversed order)
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # ensure tensor data is aligned
        # (zero bytes are written until the offset is a multiple of QK)
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            # (only part 0 writes it; later parts just skip over the data)
            if part_id == 0:
                data.tofile(fout)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            data.tofile(fout)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            # (each row of this part is written at its own column offset)
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                data[row].tofile(fout)

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))

 def main():
    """Convert the checkpoint in ``dir_model`` into a single ggml file.

    With ``vocab_only`` set, only the header and the vocabulary are written
    to ``ggml-vocab.bin``.  Otherwise the header, the vocabulary and the
    tensors of every ``consolidated.0<part>.pth`` file are written to
    ``ggml-model-f32.bin`` / ``ggml-model-f16.bin``.
    """
    args = parse_args()
    dir_model, ftype = args.dir_model, args.ftype
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
        checkpoint_path = f"{dir_model}/consolidated.00.pth"
        vocab_path = f"{dir_model}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{checkpoint_path}'\n")
        with open(vocab_path, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
        print(f"Done. Output file: {vocab_path}\n")
        return

    if args.n_parts is None:
        n_parts = get_n_parts(hparams["dim"])
    else:
        n_parts = args.n_parts
    ftype_suffix = ("f32", "f16")[ftype]
    fname_out = f"{dir_model}/ggml-model-{ftype_suffix}.bin"

    # we output a single file for ggml
    with open(fname_out, "wb") as fout:
        write_header(fout, hparams, ftype)
        write_tokens(fout, tokenizer)
        tensors_start = fout.tell()
        # the tensors we load could be split across multiple files; every
        # part rewinds to the same start and fills in its own slice
        for part_id in range(n_parts):
            fout.seek(tensors_start)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            part_path = f"{dir_model}/consolidated.0{part_id}.pth"
            # NOTE(review): torch.load unpickles the file; only use it on
            # checkpoints from a trusted source.
            model = torch.load(part_path, map_location="cpu")
            process_and_write_variables(fout, model, ftype, part_id, n_parts)
            del model

    print(f"Done. Output file: {fname_out}\n")


 if __name__ == "__main__":
    main()
	# Convert a huggingface LLaMA checkpoint to an (unsharded) pytorch checkpoint
	# comes from https://github.com/tloen/alpaca-lora/blob/main/export_state_dict_checkpoint.py

	import argparse
	import json
	from pathlib import Path

	import torch
	import transformers
	from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402

	# Command line: the checkpoint location handed to ``from_pretrained`` and the
	# LLaMA size it corresponds to.
	parser = argparse.ArgumentParser()
	parser.add_argument("base_model")
	# Reject an unknown size here instead of failing with a KeyError on the
	# params_by_model lookup, which only runs after the whole model was loaded.
	# ``str.lower`` keeps "7B"-style spellings valid.
	parser.add_argument(
	    "size_key",
	    type=str.lower,
	    choices=["7b", "13b", "30b", "65b"],
	)
	args = parser.parse_args()

	# Kept from the original script; the tokenizer object is not referenced again
	# in the visible code.
	tokenizer = LlamaTokenizer.from_pretrained(args.base_model)

	# Load the weights as float16 on the CPU.
	base_model = LlamaForCausalLM.from_pretrained(
	    args.base_model,
	    load_in_8bit=False,
	    torch_dtype=torch.float16,
	    device_map={"": "cpu"},
	)

	# Turn training mode off before exporting the weights.
	base_model.train(False)

	base_model_sd = base_model.state_dict()

	# Hyper-parameters written to params.json, keyed by lower-case model size.
	# Only dim / n_heads / n_layers differ between sizes; the key order of each
	# entry is kept because it is the order json.dump will emit.
	params_by_model = {
	    size: {
	        "dim": width,
	        "multiple_of": 256,
	        "n_heads": heads,
	        "n_layers": depth,
	        "norm_eps": 1e-06,
	        "vocab_size": -1,
	    }
	    for size, (width, heads, depth) in {
	        "7b": (4096, 32, 32),
	        "13b": (5120, 40, 40),
	        "30b": (6656, 52, 60),
	        "65b": (8192, 64, 80),
	    }.items()
	}
	params = params_by_model[args.size_key.lower()]
	n_layers, n_heads, dim = params["n_layers"], params["n_heads"], params["dim"]
	dims_per_head = dim // n_heads
	base = 10000.0
	# One inverse frequency per pair of per-head dimensions.
	inv_freq = 1.0 / base ** (
	    torch.arange(0, dims_per_head, 2).float() / dims_per_head
	)


	def permute(w):
	    """Regroup the rows of ``w`` head by head and return a (dim, dim) tensor.

	    ``w`` is viewed as (n_heads, dim // n_heads // 2, 2, dim), axes 1 and 2
	    are swapped, and the result is flattened back to (dim, dim).  ``n_heads``
	    and ``dim`` are the module-level values.  ``unpermute`` applies the
	    opposite axis swap.
	    """
	    return (
	        w.view(n_heads, dim // n_heads // 2, 2, dim)
	        .transpose(1, 2)
	        .reshape(dim, dim)
	    )


	def unpermute(w):
	    """Undo the row regrouping done by ``permute`` and return a (dim, dim) tensor.

	    ``w`` is viewed as (n_heads, 2, dim // n_heads // 2, dim), axes 1 and 2
	    are swapped, and the result is flattened back to (dim, dim).  ``n_heads``
	    and ``dim`` are the module-level values.
	    """
	    return (
	        w.view(n_heads, 2, dim // n_heads // 2, dim)
	        .transpose(1, 2)
	        .reshape(dim, dim)
	    )


	def translate_state_dict_key(k):  # noqa: C901
	    """Map a Hugging Face state-dict key to the original LLaMA checkpoint key.

	    Any ``"base_model.model."`` substring is removed first.

	    Returns:
	        The translated key, or ``None`` for per-layer keys that end with
	        ``rotary_emb.inv_freq`` or contain ``"lora"`` (these are dropped by
	        the caller).

	    Raises:
	        NotImplementedError: for any other key; the key is printed first.
	    """
	    k = k.replace("base_model.model.", "")

	    top_level = {
	        "model.embed_tokens.weight": "tok_embeddings.weight",
	        "model.norm.weight": "norm.weight",
	        "lm_head.weight": "output.weight",
	    }
	    if k in top_level:
	        return top_level[k]

	    if not k.startswith("model.layers."):
	        print(k)
	        raise NotImplementedError

	    # "model.layers.<n>.…" -> the layer index is the third dotted component.
	    layer = k.split(".")[2]
	    per_layer = (
	        (".self_attn.q_proj.weight", "attention.wq.weight"),
	        (".self_attn.k_proj.weight", "attention.wk.weight"),
	        (".self_attn.v_proj.weight", "attention.wv.weight"),
	        (".self_attn.o_proj.weight", "attention.wo.weight"),
	        (".mlp.gate_proj.weight", "feed_forward.w1.weight"),
	        (".mlp.down_proj.weight", "feed_forward.w2.weight"),
	        (".mlp.up_proj.weight", "feed_forward.w3.weight"),
	        (".input_layernorm.weight", "attention_norm.weight"),
	        (".post_attention_layernorm.weight", "ffn_norm.weight"),
	    )
	    for suffix, target in per_layer:
	        if k.endswith(suffix):
	            return f"layers.{layer}.{target}"

	    if k.endswith("rotary_emb.inv_freq") or "lora" in k:
	        return None

	    print(layer, k)
	    raise NotImplementedError


	# Re-key every tensor to the original LLaMA naming.  Keys translated to None
	# are dropped; the wq / wk projections additionally go through unpermute().
	new_state_dict = {}
	for k, v in base_model_sd.items():
	    new_k = translate_state_dict_key(k)
	    if new_k is not None:
	        if "wq" in new_k or "wk" in new_k:
	            new_state_dict[new_k] = unpermute(v)
	        else:
	            new_state_dict[new_k] = v

	out_path = Path(args.base_model, "consolidated")
	# parents=True: args.base_model is not guaranteed to exist as a local
	# directory, and failing here would throw away the conversion work.
	out_path.mkdir(parents=True, exist_ok=True)
	torch.save(new_state_dict, out_path / "consolidated.00.pth")

	# Explicit encoding so the file does not depend on the platform default.
	with open(out_path / "params.json", "w", encoding="utf-8") as f:
	    json.dump(params, f)
	# minor modification of the original file from llama.cpp
	# to account for the unsharded checkpoint;
	# call with `convert-pth-to-ggml.py <output dir of convert-hf-to-pth.py> 1 1`

	import argparse
	import os
	import sys
	import json
	import struct
	import numpy as np
	import torch

	from sentencepiece import SentencePieceProcessor

	# Block size used by the two quantized tensor types (see GGML_BLCK_SIZE).
	QK = 32

	# Numeric ids of the ggml tensor types, 0 through 6.
	(
	    GGML_TYPE_Q4_0,
	    GGML_TYPE_Q4_1,
	    GGML_TYPE_I8,
	    GGML_TYPE_I16,
	    GGML_TYPE_I32,
	    GGML_TYPE_F16,
	    GGML_TYPE_F32,
	) = range(7)

	# File-type code (as used for ``ftype``) -> ggml tensor type id.
	WTYPES = dict(
	    enumerate((GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1))
	)

	# Elements per block: QK for the quantized types, 1 for everything else.
	GGML_BLCK_SIZE = {
	    **dict.fromkeys((GGML_TYPE_Q4_0, GGML_TYPE_Q4_1), QK),
	    **dict.fromkeys(
	        (GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_F32),
	        1,
	    ),
	}

	# Size of one block of each type; ggml_nbytes multiplies by this and divides
	# by the block size, so the unit is presumably bytes.
	GGML_TYPE_SIZE = {
	    GGML_TYPE_Q4_0: 4 + QK // 2,
	    GGML_TYPE_Q4_1: 4 * 2 + QK // 2,
	    GGML_TYPE_I8: 1,
	    GGML_TYPE_I16: 2,
	    GGML_TYPE_I32: 4,
	    GGML_TYPE_F16: 2,
	    GGML_TYPE_F32: 4,
	}

	def ggml_nelements(shape):
	    """Return the product of all extents in ``shape`` (1 for an empty shape)."""
	    r = 1
	    for i in shape:
	        r *= i
	    return r

	def ggml_nbytes(shape, ftype):
	    """Return the storage size of a tensor of ``shape`` for file type ``ftype``.

	    ``ftype`` is translated to a ggml type through WTYPES; the element count
	    is multiplied by that type's GGML_TYPE_SIZE and floor-divided by its
	    GGML_BLCK_SIZE.
	    """
	    x = ggml_nelements(shape)
	    t = WTYPES[ftype]
	    x *= GGML_TYPE_SIZE[t]
	    x //= GGML_BLCK_SIZE[t]
	    return x

	def parse_args():
	    """Parse the command line and return the argparse namespace.

	    Positionals, in order: ``dir_model``, ``ftype`` (0 or 1), the optional
	    ``vocab_only`` flag (defaults to 0) and ``n_parts``.  Only ``vocab_only``
	    is declared with ``nargs='?'``.
	    """
	    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
	    parser.add_argument('dir_model', help='directory containing the model checkpoint')
	    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
	    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
	    parser.add_argument('n_parts', type=int, default=None)
	    return parser.parse_args()

	def get_n_parts(dim):
	    """Return the number of checkpoint parts for a model of width ``dim``.

	    Prints the chosen value.  For a ``dim`` that is not in the table, prints
	    an error and exits the process with status 1.
	    """
	    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
	    n_parts = mappings.get(dim)
	    if n_parts is None:
	        print(f"Invalid dim: {dim}")
	        sys.exit(1)

	    print(f"n_parts = {n_parts}\n")
	    return n_parts

	def load_hparams_and_tokenizer(dir_model):
	    """Load ``params.json`` and the SentencePiece tokenizer for a model.

	    ``params.json`` is read from ``dir_model`` and printed; ``tokenizer.model``
	    is read from the parent directory of ``dir_model``.  The ``vocab_size``
	    entry of the returned hyper-parameters is overwritten with the
	    tokenizer's vocabulary size.

	    Returns:
	        A ``(hparams, tokenizer)`` tuple.
	    """
	    # `dir_model` is something like `models/7B` or `models/7B/`.
	    # "tokenizer.model" is expected under model's parent dir.
	    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
	    # Let's use the model's parent dir directly.
	    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
	    fname_hparams = f"{dir_model}/params.json"
	    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
	    with open(fname_hparams, "r") as f:
	        hparams = json.load(f)
	        print(hparams)
	    tokenizer = SentencePieceProcessor(fname_tokenizer)
	    hparams.update({"vocab_size": tokenizer.vocab_size()})
	    return hparams, tokenizer

	def write_header(fout, hparams, ftype):
	    """Write the file header to ``fout`` as nine native ``int`` values.

	    Order: magic, file version, vocab_size, dim, multiple_of, n_heads,
	    n_layers, dim // n_heads, ftype.
	    """
	    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
	    values = [
	        0x67676a74,  # magic: ggjt in hex
	        1,  # file version
	        *[hparams[key] for key in keys],
	        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
	        ftype,
	    ]
	    fout.write(struct.pack("i" * len(values), *values))

	def write_tokens(fout, tokenizer):
	    """Write every vocabulary entry of ``tokenizer`` to ``fout``.

	    Each entry is written as: native ``int`` byte length, the token bytes,
	    native ``float`` score.  Exits the process with status 1 when a byte
	    token's piece is not exactly six characters long.
	    """
	    for i in range(tokenizer.vocab_size()):
	        if tokenizer.is_unknown(i):
	            # Unknown token: U+2047 surrounded by spaces.
	            text = " \u2047 ".encode()
	        elif tokenizer.is_control(i):
	            # Control tokens carry no text.
	            text = b""
	        elif tokenizer.is_byte(i):
	            # The hex digits sit between the first three characters and
	            # the last one (e.g. a piece such as "<0x41>").
	            piece = tokenizer.id_to_piece(i)
	            if len(piece) != 6:
	                print(f"Invalid token: {piece}")
	                sys.exit(1)
	            byte_value = int(piece[3:-1], 16)
	            text = struct.pack("B", byte_value)
	        else:
	            # Regular pieces: U+2581 is replaced by a plain space.
	            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
	        fout.write(struct.pack("i", len(text)))
	        fout.write(text)
	        fout.write(struct.pack("f", tokenizer.get_score(i)))

	def process_and_write_variables(fout, model, ftype, part_id, n_parts):
	    """Write the tensors of one checkpoint part into the ggml output file.

	    For every entry of ``model`` (entries whose name ends with "freqs" are
	    skipped) a tensor header is written, the file is zero-padded to a
	    multiple of QK bytes, and the tensor data is written at the position that
	    belongs to ``part_id`` inside the full tensor.  Afterwards the file
	    position is moved past the full tensor.

	    Args:
	        fout: seekable binary output file (``write``, ``seek`` and ``tell``
	            are used).
	        model: mapping of tensor name to an object providing ``.numpy()`` and
	            ``.dtype``.
	        ftype: requested file type; 0 converts every tensor to float32.
	        part_id: zero-based index of the checkpoint part being written.
	        n_parts: total number of checkpoint parts.

	    NOTE(review): in the ``ftype == 1`` path 2-D tensors are written without
	    any dtype conversion, so they are presumably already float16 — confirm.
	    """
	    for name, datao in model.items():
	        if name.endswith("freqs"):
	            continue

	        # remove dimensions with a single element
	        data = datao.numpy().squeeze()
	        partshape = data.shape
	        n_dims = len(data.shape)
	        assert n_dims in (1, 2)

	        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")

	        # coerce single-dimensional tensors from float16 to float32
	        # (ftype_cur is the per-tensor file type: 1 unless converted to float32)
	        ftype_cur = 1
	        if ftype == 0 or n_dims == 1:
	            print(" Converting to float32")
	            data = data.astype(np.float32)
	            ftype_cur = 0
	        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
	        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

	        # determine dimension along which multipart tensor is sharded
	        #
	        # split_dim 0 regex:
	        # - output.*
	        # - layers.*.attention.wq.weight
	        # - layers.*.attention.wk.weight
	        # - layers.*.attention.wv.weight
	        # - layers.*.feed_forward.w1.weight
	        # - layers.*.feed_forward.w3.weight
	        #
	        # split_dim 1 regex:
	        # - tok_embeddings.*
	        # - layers.*.attention.wo.weight
	        # - layers.*.feed_forward.w2.weight
	        #
	        # split_dim is only assigned for 2-D tensors; every later use is
	        # guarded by n_dims > 1 or by the n_dims == 1 branch below.
	        if n_dims > 1:
	            split_dim = 1
	            if "tok_embeddings" in name:
	                split_dim = 1
	            elif "layers" in name:
	                if "attention.wo.weight" in name:
	                    split_dim = 1
	                elif "feed_forward.w2.weight" in name:
	                    split_dim = 1
	                else:
	                    split_dim = 0
	            elif "output" in name:
	                split_dim = 0

	        # output tensor header
	        # (the full tensor is n_parts times the part along the split dimension;
	        # the extents are written in reversed order)
	        fullshape = list(partshape)
	        if n_dims > 1:
	            fullshape[split_dim] *= n_parts
	        sname = name.encode()
	        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
	        for dim in reversed(fullshape):
	            fout.write(struct.pack("i", dim))
	        fout.write(sname)

	        # ensure tensor data is aligned
	        # (zero bytes are written until the offset is a multiple of QK)
	        tensor_data_offset = fout.tell()
	        while tensor_data_offset % QK != 0:
	            fout.write(struct.pack("B", 0))
	            tensor_data_offset += 1

	        # output unified mappable tensor data
	        if n_dims == 1 or n_parts == 1:
	            # copy tensor which we thankfully received in one piece
	            # (only part 0 writes it; later parts just skip over the data)
	            if part_id == 0:
	                data.tofile(fout)
	        elif split_dim == 0:
	            # reassemble multifile tensor containing some of the rows
	            rows_per_chunk = partshape[0]
	            current_row = part_id * rows_per_chunk
	            bytes_per_row = fullshape[1] // blck_size * type_size
	            offset = current_row * bytes_per_row
	            fout.seek(tensor_data_offset + offset)
	            data.tofile(fout)
	        elif split_dim == 1:
	            # reassemble multifile tensor containing some of the cols
	            # (each row of this part is written at its own column offset)
	            cols_per_chunk = partshape[1]
	            current_col = part_id * cols_per_chunk
	            bytes_per_row = fullshape[1] // blck_size * type_size
	            offset_current_col = current_col // blck_size * type_size
	            for row in range(partshape[0]):
	                offset_row = row * bytes_per_row
	                offset = offset_row + offset_current_col
	                fout.seek(tensor_data_offset + offset)
	                data[row].tofile(fout)

	        # advance file position to next tensor
	        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))

	def main():
	    """Convert the checkpoint in ``dir_model`` into a single ggml file.

	    With ``vocab_only`` set, only the header and the vocabulary are written
	    to ``ggml-vocab.bin``.  Otherwise the header, the vocabulary and the
	    tensors of every ``consolidated.0<part>.pth`` file are written to
	    ``ggml-model-f32.bin`` / ``ggml-model-f16.bin``.
	    """
	    args = parse_args()
	    dir_model = args.dir_model
	    ftype = args.ftype
	    ftype_str = ["f32", "f16"]
	    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

	    print(args)

	    # if only writing vocab to file
	    if args.vocab_only:
	        fname_model = f"{dir_model}/consolidated.00.pth"
	        fname_out = f"{dir_model}/ggml-vocab.bin"
	        print(f"Extracting only the vocab from '{fname_model}'\n")
	        with open(fname_out, "wb") as fout:
	            write_header(fout, hparams, ftype)
	            write_tokens(fout, tokenizer)
	        print(f"Done. Output file: {fname_out}\n")
	        return

	    n_parts = args.n_parts if args.n_parts is not None else get_n_parts(hparams["dim"])
	    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"

	    # we output a single file for ggml
	    with open(fname_out, "wb") as fout:
	        write_header(fout, hparams, ftype)
	        write_tokens(fout, tokenizer)
	        offset_of_tensors = fout.tell()
	        # the tensors we load could be split across multiple files; every
	        # part rewinds to the same start and fills in its own slice
	        for part_id in range(n_parts):
	            fout.seek(offset_of_tensors)
	            print(f"Processing part {part_id+1} of {n_parts}\n")
	            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
	            # NOTE(review): torch.load unpickles the file; only use it on
	            # checkpoints from a trusted source.
	            model = torch.load(fname_model, map_location="cpu")
	            process_and_write_variables(fout, model, ftype, part_id, n_parts)
	            del model

	    print(f"Done. Output file: {fname_out}\n")


	if __name__ == "__main__":
	    main()