@p208p2002
Last active September 30, 2024 08:05
# Converting GPT-2 BPE tokenizer tokens back to UTF-8
# The conversion only applies to tokens that are not in the vocabulary as whole words
# and are therefore represented as bytes (e.g. Chinese characters).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
word = "台"
tokens = tokenizer.convert_ids_to_tokens(tokenizer(word,add_special_tokens=False)["input_ids"])
print("tokens:",tokens)
# Convert back to UTF-8 bytes
mid_token_offset = 162  # tokens at non-head/tail positions carry an extra positional offset
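# (Note on where 162 comes from: GPT-2's byte-to-unicode table remaps bytes it cannot
#  print directly, such as 0x8f here, to code points 256 + n; for bytes in the
#  0x7F-0xA0 range this works out to a shift of 162. It fits this example, but it is
#  not a universal constant for every byte.)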
tokens = "".join(tokens) # 合併
integer_unicode_x = [ord(t) for t in tokens] # 文字轉對應unicode(十進位表示)
for idx,int_x in enumerate(integer_unicode_x):
if idx == 0 or idx == len(integer_unicode_x) - 1:
hex_x = format(int_x,"x")
print(f"\\x{hex_x}") # 十進位轉十六進位
else:
hex_x = format(int_x-mid_token_offset,"x") # 扣除offset
print(f"\\x{hex_x}")
print(word.encode("utf-8")) # 與迴圈印出的內容相符
# tokens: ['åı', '°']
# \xe5
# \x8f
# \xb0
# b'\xe5\x8f\xb0'
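# A more general sketch (not part of the original gist): instead of hard-coding the
# 162 offset, invert GPT-2's byte-to-unicode table and recover the raw bytes for any
# token. This assumes bytes_to_unicode is importable from
# transformers.models.gpt2.tokenization_gpt2, which holds for recent transformers releases.
from transformers import AutoTokenizer
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode

tokenizer = AutoTokenizer.from_pretrained("gpt2")
ids = tokenizer("台", add_special_tokens=False)["input_ids"]
bpe_tokens = tokenizer.convert_ids_to_tokens(ids)

unicode_to_byte = {ch: b for b, ch in bytes_to_unicode().items()}  # invert the byte -> char table
raw_bytes = bytes(unicode_to_byte[ch] for ch in "".join(bpe_tokens))
print(raw_bytes)                  # b'\xe5\x8f\xb0'
print(raw_bytes.decode("utf-8"))  # 台
# tokenizer.convert_tokens_to_string(bpe_tokens) should give the same result through the library itself.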