import numpy as np
from gguf.gguf_reader import GGUFReader

def get_string_array_field(gguf_reader: GGUFReader, key: str) -> list[str]:
    # For array-valued fields, f.data holds the indices into f.parts where the
    # elements live; each string element is stored as raw UTF-8 bytes.
    f = gguf_reader.get_field(key)
    return [bytes(f.parts[d]).decode("utf-8") for d in f.data]

def get_field(gguf_reader: GGUFReader, key: str):
    f = gguf_reader.get_field(key)
    if f is None:
        return 0  # treat a missing key as zero
    if len(f.data) != 1:
        raise NotImplementedError("multiple data values are not supported")
    part = f.parts[f.data[0]]
    if len(part) != 1:
        raise NotImplementedError("multiple parts are not supported")
    # Convert numpy scalars to plain Python numbers.
    value = part[0]
    if isinstance(value, np.float32):
        return float(value)
    elif isinstance(value, np.uint32):
        return int(value)
    return value

gguf_path = "/path/to/mistral-7b-v0.1.Q6_K.gguf"
gguf_reader = GGUFReader(gguf_path)
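
# get_field comes in handy for scalar metadata; a quick sketch using two
# standard GGUF llama keys (the values depend on the file, so this is
# purely illustrative):
context_length = get_field(gguf_reader, "llama.context_length")
embedding_length = get_field(gguf_reader, "llama.embedding_length")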
# The merges aren't needed here: SentencePiece's BPE ranks merges by the
# per-piece scores loaded below, so only the token list itself is required.
tokens = get_string_array_field(gguf_reader, "tokenizer.ggml.tokens")
# merges = get_string_array_field(gguf_reader, "tokenizer.ggml.merges")
# merges = [tuple(m.split(" ")) for m in merges]
# An array field's first five .parts are metadata (key length, key bytes,
# field type, element type, element count), so the data starts at parts[5].
scores = [p.item() for p in gguf_reader.get_field("tokenizer.ggml.scores").parts[5:]]
token_types = [p.item() for p in gguf_reader.get_field("tokenizer.ggml.token_type").parts[5:]]
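
# Sanity check: the three arrays must line up one-to-one.
assert len(tokens) == len(scores) == len(token_types)
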
import sentencepiece as spm
import sentencepiece.sentencepiece_model_pb2 as model
# Identity normalizer: no charsmap and no rule TSV, leaving only the
# dummy-prefix space that Llama-family tokenizers expect.
normalizer_spec = model.NormalizerSpec(
    name="identity",
    precompiled_charsmap=b"",
    add_dummy_prefix=True,
    remove_extra_whitespaces=False,
    normalization_rule_tsv="",
)

trainer_spec = model.TrainerSpec(
    model_type="BPE",
    vocab_size=len(tokens),  # 32000 for Mistral-7B v0.1
    input_format="text",
    split_by_unicode_script=True,
    split_by_whitespace=True,
    split_by_number=True,
    treat_whitespace_as_suffix=False,
    split_digits=True,
    allow_whitespace_only_pieces=True,
    vocabulary_output_piece_score=True,
    byte_fallback=True,
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=-1,
    unk_piece="<unk>",
    bos_piece="<s>",
    eos_piece="</s>",
    pad_piece="<pad>",
    pretokenization_delimiter="",
)

m = model.ModelProto(trainer_spec=trainer_spec, normalizer_spec=normalizer_spec)
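
# GGUF token types reuse SentencePiece's piece-type numbering (NORMAL=1,
# UNKNOWN=2, CONTROL=3, USER_DEFINED=4, UNUSED=5, BYTE=6), so the values read
# from the file pass straight through as `type` here.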
for token, score, token_type in zip(tokens, scores, token_types):
    m.pieces.append(
        model.ModelProto.SentencePiece(piece=token, score=score, type=token_type)
    )
tokenizer = spm.SentencePieceProcessor(model_proto=m.SerializeToString())

# Smoke test in a REPL:
>>> tokenizer.encode("how tall is obama")
[910, 9369, 349, 818, 2786]
>>> tokenizer.decode([910, 9369, 349, 818, 2786])
'how tall is obama'
>>> tokenizer.id_to_piece(910)
'▁how'
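
# To reuse the result elsewhere, the serialized proto can be written out as a
# regular SentencePiece .model file (the filename here is an arbitrary choice):
with open("mistral-7b-v0.1.model", "wb") as fout:
    fout.write(m.SerializeToString())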