@jbochi
Created January 21, 2024 21:20
SentencePiece model from GGUF: rebuild a SentencePiece tokenizer from the tokenizer metadata stored in a GGUF file (here, a Mistral 7B v0.1 Q6_K quant).
import numpy as np
from gguf.gguf_reader import GGUFReader


def get_string_array_field(gguf_reader: GGUFReader, key: str):
    """Decode a GGUF string-array metadata field into a list of Python strings."""
    f = gguf_reader.get_field(key)
    return [bytes(f.parts[d]).decode("utf-8") for d in f.data]


def get_field(gguf_reader: GGUFReader, key: str):
    """Read a scalar GGUF metadata field, converting numpy scalars to plain Python types."""
    f = gguf_reader.get_field(key)
    if f is None:
        return 0
    if len(f.data) != 1:
        raise NotImplementedError("multiple data entries are not supported")
    part = f.parts[f.data[0]]
    if len(part) != 1:
        raise NotImplementedError("multiple parts are not supported")
    value = part[0]
    if isinstance(value, np.float32):
        return float(value)
    elif isinstance(value, np.uint32):
        return int(value)
    return value

gguf_path = "/path/to/mistral-7b-v0.1.Q6_K.gguf"
gguf_reader = GGUFReader(str(gguf_path))
tokens = get_string_array_field(gguf_reader, "tokenizer.ggml.tokens")
# merges = get_string_array_field(gguf_reader, "tokenizer.ggml.merges")
# merges = [tuple(m.split(" ")) for m in merges]
# For numeric arrays the first 5 parts hold the field's metadata (key length, key,
# field type, item type, array length); the values themselves start at index 5.
scores = [p.item() for p in gguf_reader.get_field("tokenizer.ggml.scores").parts[5:]]
token_types = [p.item() for p in gguf_reader.get_field("tokenizer.ggml.token_type").parts[5:]]
import sentencepiece as spm
import sentencepiece.sentencepiece_model_pb2 as model
normalizer_spec = model.NormalizerSpec(
    name="identity",
    precompiled_charsmap=b"",
    add_dummy_prefix=True,
    remove_extra_whitespaces=False,
    normalization_rule_tsv=b"",
)
trainer_spec = model.TrainerSpec(
    model_type="BPE",
    vocab_size=32000,
    input_format="text",
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    treat_whitespace_as_suffix=False,
    split_digits=True,
    allow_whitespace_only_pieces=True,
    vocabulary_output_piece_score=True,
    byte_fallback=True,
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=-1,
    unk_piece="<unk>",
    bos_piece="<s>",
    eos_piece="</s>",
    pad_piece="<pad>",
    pretokenization_delimiter="",
)
m = model.ModelProto(trainer_spec=trainer_spec, normalizer_spec=normalizer_spec)
for token, score, token_type in zip(tokens, scores, token_types):
    m.pieces.append(model.ModelProto.SentencePiece(piece=token, score=score, type=token_type))
tokenizer = spm.SentencePieceProcessor(model_proto=m.SerializeToString())
print(tokenizer.encode("how tall is obama"))          # [910, 9369, 349, 818, 2786]
print(tokenizer.decode([910, 9369, 349, 818, 2786]))  # 'how tall is obama'
print(tokenizer.id_to_piece(910))                     # '▁how'
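
# Optional extra (not part of the original gist): a minimal sketch of persisting the
# rebuilt tokenizer so it can be reloaded later without re-reading the GGUF file.
# The "tokenizer.model" filename is only an example.
with open("tokenizer.model", "wb") as out_f:
    out_f.write(m.SerializeToString())

tokenizer_from_file = spm.SentencePieceProcessor(model_file="tokenizer.model")
assert tokenizer_from_file.encode("how tall is obama") == [910, 9369, 349, 818, 2786]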