Created
February 21, 2024 19:00
-
-
Save shreyansh26/1f7d491d6bacc83692f51048ddf16c2b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Dump a SentencePiece model's vocabulary and model protobuf to JSON files.

Reads the model at ``PATH`` and writes two files into the current working
directory:

* ``vocab.json``  -- maps each token id to the text ``detokenize`` returns
  for that single id.
* ``config.json`` -- the parsed ``ModelProto`` message converted to a dict.

The trainer and normalizer specs are also printed to stdout.
"""
import json

import sentencepiece as spm
import sentencepiece.sentencepiece_model_pb2 as sp_pb2
from google.protobuf.json_format import MessageToDict

# Location of the serialized SentencePiece model to inspect.
PATH = "tokenizer.model"

s = spm.SentencePieceProcessor(model_file=PATH)

# Get vocab: one entry per id in [0, vocab_size). The integer keys are
# written as strings by json.dump, since JSON object keys must be strings.
dc = {i: s.detokenize(i) for i in range(s.vocab_size())}

with open('vocab.json', 'w') as f:
    json.dump(dc, f, indent=4)

# Dump the model protobuf. The model file is a serialized ModelProto, so it
# is read as raw bytes; the context manager closes the handle right after
# the read instead of leaving that to the garbage collector.
mp = sp_pb2.ModelProto()
with open(PATH, "rb") as f:
    mp.ParseFromString(f.read())

dict_obj = MessageToDict(mp)

with open('config.json', 'w') as f:
    json.dump(dict_obj, f, indent=4)

# Directly print values
print(mp.trainer_spec)
print(mp.normalizer_spec)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment