import tokenizers
tokenizers.__version__
'0.8.1'
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)
small_corpus = 'very_small_corpus.txt'
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=10,
    min_frequency=1,
    limit_alphabet=1000,
    initial_alphabet=[],
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    show_progress=True,
    wordpieces_prefix="##",
)
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', '##b', '##c', '##d', '##e', '##f']
encoding = bert_wordpiece_tokenizer.encode('ABCDE')
print(encoding.tokens)
print(encoding.ids)
['a', '##b', '##c', '##d', '##e']
[5, 11, 12, 13, 14]
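The tokens come back lowercased because BertWordPieceTokenizer applies BERT-style normalization, which lowercases input by default. A minimal sketch of keeping the original case, assuming the lowercase constructor argument of this version (the name cased_tokenizer is illustrative):

cased_tokenizer = BertWordPieceTokenizer(lowercase=False)
cased_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
)
# With lowercase=False the learned vocabulary keeps upper-case characters,
# so 'ABCDE' should now be segmented into upper-case subwords.
print(cased_tokenizer.encode('ABCDE').tokens)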
bert_wordpiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
    initial_alphabet=['g'],
)
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##b', '##d', '##c', '##e', '##f', 'ab', 'abc', 'ac']
encodings = bert_wordpiece_tokenizer.encode_batch(['ABCDE', 'abcd'])
print(encodings[0].tokens)
['abc', '##d', '##e']
bert_wordpiece_tokenizer.save_model(
    directory='./',
    name='very_small_bertwordpiece'
)
['./very_small_bertwordpiece-vocab.txt']
bert_wordpiece_tokenizer = BertWordPieceTokenizer(
    vocab_file='./very_small_bertwordpiece-vocab.txt'
)
bert_wordpiece_tokenizer.encode('ABCDE').tokens
['[CLS]', 'abc', '##d', '##e', '[SEP]']
bert_wordpiece_tokenizer.encode('ABCDE', add_special_tokens=False).tokens
['abc', '##d', '##e']
bert_wordpiece_tokenizer.encode(
    sequence='abcde',
    pair='abcd'
).tokens
['[CLS]', 'abc', '##d', '##e', '[SEP]', 'abc', '##d', '[SEP]']
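For sentence pairs, the Encoding object also carries segment information alongside tokens and ids. A short sketch, assuming the type_ids attribute of the returned Encoding (pair_encoding is just an illustrative name):

pair_encoding = bert_wordpiece_tokenizer.encode(sequence='abcde', pair='abcd')
# type_ids should mark the first segment (including [CLS] and its [SEP]) with 0
# and the second segment (including the final [SEP]) with 1.
print(pair_encoding.tokens)
print(pair_encoding.type_ids)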
bert_wordpiece_tokenizer.add_tokens(['lovit'])
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##b', '##d', '##c', '##e', '##f', 'ab', 'abc', 'ac', 'lovit']
bert_wordpiece_tokenizer.encode('ABCDE abg lovit').tokens
['[CLS]', 'abc', '##d', '##e', '[UNK]', 'lovit', '[SEP]']
# Tokens added with add_tokens are not saved to the vocab file at the moment
bert_wordpiece_tokenizer = BertWordPieceTokenizer(
    vocab_file='./very_small_bertwordpiece-vocab.txt'
)
vocab = bert_wordpiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##b', '##d', '##c', '##e', '##f', 'ab', 'abc', 'ac']
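The vocab file written by save_model stores only the trained WordPiece vocabulary, which is why 'lovit' disappeared after reloading. A sketch of one possible workaround, assuming the full-tokenizer serialization introduced around tokenizers 0.8 (Tokenizer.save to a single JSON file, Tokenizer.from_file to reload); the file name is illustrative:

from tokenizers import Tokenizer

# Save the whole tokenizer state (model, normalizer, added tokens, ...).
bert_wordpiece_tokenizer.add_tokens(['lovit'])
bert_wordpiece_tokenizer.save('./very_small_bertwordpiece.json')

# Reload as a plain Tokenizer object; the added token should survive this round trip.
reloaded = Tokenizer.from_file('./very_small_bertwordpiece.json')
print(reloaded.get_vocab_size())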
SentencePiece BPE Tokenizer
sentencepiece_tokenizer = SentencePieceBPETokenizer(
    add_prefix_space=True,
)
sentencepiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
    special_tokens=['<unk>'],
)
vocab = sentencepiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['<unk>', 'A', 'B', 'C', 'D', 'E', 'F', '▁', '▁A', '▁AB', '▁ABC', 'DE', '▁DE', '▁AC', '▁AF', '▁ABD', '▁ABCDE']
sentencepiece_tokenizer = SentencePieceBPETokenizer(
    add_prefix_space=False
)
sentencepiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
    special_tokens=['<unk>', 'lovit'],
)
vocab = sentencepiece_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['<unk>', 'lovit', 'A', 'B', 'C', 'D', 'E', 'F', '▁', '▁A', '▁AB', 'DE', '▁ABC', 'AB', 'CDE', '▁AC', '▁AF', '▁ABD', 'ABCDE']
sentencepiece_tokenizer.save_model('./', 'very_small_sentencepiece')
['./very_small_sentencepiece-vocab.json',
'./very_small_sentencepiece-merges.txt']
sentencepiece_tokenizer = SentencePieceBPETokenizer(
    vocab_file='./very_small_sentencepiece-vocab.json',
    merges_file='./very_small_sentencepiece-merges.txt'
)
sentencepiece_tokenizer.encode('ABCDE').tokens
['▁ABC', 'DE']
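Because the '▁' symbol records where whitespace was, the ids can be decoded back into (roughly) the original text. A small sketch, assuming the decode method of this version:

encoding = sentencepiece_tokenizer.encode('ABCDE')
# The Metaspace decoder turns '▁' back into spaces, so this is expected to print 'ABCDE'.
print(sentencepiece_tokenizer.decode(encoding.ids))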
sentencepiece_tokenizer.encode('ABCDE abc lovit').tokens
['▁ABC',
'DE',
'▁',
'<unk>',
'<unk>',
'<unk>',
'▁',
'<unk>',
'<unk>',
'<unk>',
'<unk>',
'<unk>']
charbpe_tokenizer = CharBPETokenizer(suffix='</w>')
charbpe_tokenizer.train(
    files=[small_corpus],
    vocab_size=15,
    min_frequency=1
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens
['AB', 'C', 'DE</w>', 'ABC</w>']
charbpe_tokenizer = CharBPETokenizer(
    suffix='</w>',
    split_on_whitespace_only=True
)
charbpe_tokenizer.train(
    files=[small_corpus],
    vocab_size=15,
    min_frequency=1
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens
['AB', 'C', 'D', 'E', 'ABC</w>']
A character that never appeared in the training corpus is not mapped to an unknown token by this tokenizer; it is simply dropped from the encoding. In the example below, 'G' and 'H' vanish because very_small_corpus.txt only contains the characters A-F:
charbpe_tokenizer.encode('ABCDEFGH').tokens
['AB', 'C', 'D', 'E', 'F']
# OpenAI GPT2 tokenizer
bytebpe_tokenizer = ByteLevelBPETokenizer(
    add_prefix_space=False,
    lowercase=False,
)
bytebpe_tokenizer.train(
    files=[small_corpus],
    vocab_size=1000,
    min_frequency=1
)
vocab = bytebpe_tokenizer.get_vocab()
print(sorted(vocab, key=lambda x: vocab[x]))
['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'IJ', 'ij', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'ĸ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ĠA', 'ĠAB', 'DE', 'ĠABC', 'AB', 'CDE', 'ĠAC', 'ĠAF', 'ĠABD', 'ABCDE']
bytebpe_tokenizer.encode('ABCDE ABC').tokens
['ABCDE', 'ĠABC']
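A byte-level BPE loses no information at the byte level, so decoding the ids is expected to reproduce the input string. A minimal sketch, assuming the decode method of this version:

encoding = bytebpe_tokenizer.encode('ABCDE ABC')
# 'Ġ' stands for a leading space at the byte level; decoding should print 'ABCDE ABC'.
print(bytebpe_tokenizer.decode(encoding.ids))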
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)
corpus_path = '../data/2020-07-29_covid_news_sents.txt'
add_prefix_space = True
vocab_size = 3000
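The four tokenizers below are each trained on the news corpus and saved under ./tokenizers/<TokenizerName>/. In case save_model does not create missing directories in this version, a small sketch that prepares them first (standard library only):

import os

# Create the output directories used by the save_model calls below.
for name in ['ByteLevelBPETokenizer', 'CharBPETokenizer',
             'SentencePieceBPETokenizer', 'BertWordPieceTokenizer']:
    os.makedirs(f'./tokenizers/{name}/', exist_ok=True)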
byte_level_bpe_tokenizer = ByteLevelBPETokenizer()
byte_level_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
byte_level_bpe_tokenizer.save_model(directory='./tokenizers/ByteLevelBPETokenizer/', name='covid')
['./tokenizers/ByteLevelBPETokenizer/covid-vocab.json',
'./tokenizers/ByteLevelBPETokenizer/covid-merges.txt']
char_bpe_tokenizer = CharBPETokenizer()
char_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
char_bpe_tokenizer.save_model(directory='./tokenizers/CharBPETokenizer/', name='covid')
['./tokenizers/CharBPETokenizer/covid-vocab.json',
'./tokenizers/CharBPETokenizer/covid-merges.txt']
sentencepiece_bpe_tokenizer = SentencePieceBPETokenizer()
sentencepiece_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
sentencepiece_bpe_tokenizer.save_model(directory='./tokenizers/SentencePieceBPETokenizer/', name='covid')
['./tokenizers/SentencePieceBPETokenizer/covid-vocab.json',
'./tokenizers/SentencePieceBPETokenizer/covid-merges.txt']
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
bert_wordpiece_tokenizer.save_model(directory='./tokenizers/BertWordPieceTokenizer/', name='covid')
['./tokenizers/BertWordPieceTokenizer/covid-vocab.txt']
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'
tokenizers = [bert_wordpiece_tokenizer,
              sentencepiece_bpe_tokenizer,
              char_bpe_tokenizer,
              byte_level_bpe_tokenizer]
for tokenizer in tokenizers:
    encode_single = tokenizer.encode(sent_ko)
    print(f'\n{tokenizer.__class__.__name__}')
    print(f'tokens = {encode_single.tokens}')
    print(f'ids    = {encode_single.ids}')
BertWordPieceTokenizer
tokens = ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
ids    = [1264, 1275, 1296, 12, 901, 13, 1605, 817, 1561, 1208, 2571]
SentencePieceBPETokenizer
tokens = ['▁신종', '▁코로나바이러스', '▁감염증(코로나19)', '▁사태', '가', '▁심', '각', '합', '니다']
ids    = [1246, 1235, 1275, 1493, 113, 1469, 114, 945, 2633]
CharBPETokenizer
tokens = ['신종</w>', '코로나바이러스</w>', '감염증</w>', '(</w>', '코로나19</w>', ')</w>', '사태', '가</w>', '심', '각', '합니다</w>']
ids    = [1946, 1956, 1948, 1843, 1884, 1821, 2198, 1014, 589, 115, 2480]
ByteLevelBPETokenizer
tokens = ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ë٬ìĬ¤', 'Ġê°IJìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
ids    = [2472, 875, 898, 7, 616, 397, 8, 1233, 291, 1235, 784, 2247]
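The ByteLevelBPETokenizer output looks garbled only because each token is displayed in its byte-level alphabet; no information is lost. A short sketch decoding the ids back to the sentence, assuming the decode method of this version:

encoding = byte_level_bpe_tokenizer.encode(sent_ko)
# Expected to print the original Korean sentence; the odd-looking symbols are just
# a display-level encoding of the UTF-8 bytes.
print(byte_level_bpe_tokenizer.decode(encoding.ids))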
Using the trained tokenizers with transformers
from transformers import BertTokenizer, GPT2Tokenizer
transformers_bert_tokenizer = BertTokenizer(
    vocab_file='./tokenizers/BertWordPieceTokenizer/covid-vocab.txt'
)
print(f'tokenizers : {bert_wordpiece_tokenizer.encode(sent_ko).tokens}')
print(f'transformers: {transformers_bert_tokenizer.tokenize(sent_ko)}')
tokenizers : ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers: ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
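Since the two tokenizers share the same vocab file, the transformers tokenizer can also produce model-ready input ids. A small sketch, assuming the standard encode / convert_ids_to_tokens API of BertTokenizer:

# encode() adds [CLS] and [SEP] by default, so this id sequence is what a BERT model would consume.
input_ids = transformers_bert_tokenizer.encode(sent_ko)
print(input_ids)
print(transformers_bert_tokenizer.convert_ids_to_tokens(input_ids))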
from unicodedata import normalize
print(normalize('NFKD', '가감'))         # 가감 ; the decomposed jamo are recomposed when displayed
print(len(normalize('NFKD', '가감')))    # 5
print(normalize('NFKC', normalize('NFKD', '가감')))         # 가감
print(len(normalize('NFKC', normalize('NFKD', '가감'))))    # 2
가감
5
가감
2
def compose(tokens):
    return [normalize('NFKC', token) for token in tokens]

print(f'tokenizers : {compose(bert_wordpiece_tokenizer.encode(sent_ko).tokens)}')
print(f'transformers: {compose(transformers_bert_tokenizer.tokenize(sent_ko))}')
tokenizers : ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers: ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers_gpt2_tokenizer = GPT2Tokenizer(
    vocab_file='./tokenizers/ByteLevelBPETokenizer/covid-vocab.json',
    merges_file='./tokenizers/ByteLevelBPETokenizer/covid-merges.txt'
)
print(f'tokenizers : {byte_level_bpe_tokenizer.encode(sent_ko).tokens}')
print(f'transformers: {transformers_gpt2_tokenizer.tokenize(sent_ko)}')
tokenizers : ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ë٬ìĬ¤', 'Ġê°IJìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
transformers: ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ë٬ìĬ¤', 'Ġê°IJìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
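To turn the byte-level tokens back into readable text on the transformers side, GPT2Tokenizer provides convert_tokens_to_string; a brief sketch, assuming that method is available in the installed version:

tokens = transformers_gpt2_tokenizer.tokenize(sent_ko)
# Should reconstruct the original sentence from the byte-level tokens.
print(transformers_gpt2_tokenizer.convert_tokens_to_string(tokens))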
print(compose(transformers_bert_tokenizer.tokenize('lovit 이란 이름은 인식을 안합니다')))
print(compose(transformers_bert_tokenizer.tokenize('lovit 이란 이름은 인식을 안했어')))
['l', '##o', '##v', '##i', '##t', '이라', '##ᆫ', '이', '##름', '##은', '인', '##식을', '안', '##합니다']
['l', '##o', '##v', '##i', '##t', '이라', '##ᆫ', '이', '##름', '##은', '인', '##식을', '안', '##했', '##어']
I have a question about the section above where the tokenizers are trained on COVID-19 news. For the three tokenizers other than BertWordPieceTokenizer, save_model seems to produce two files, covid-vocab.json and covid-merges.txt. Judging from the file names, covid-vocab.json appears to be the vocabulary file, but I am curious what kind of file covid-merges.txt is. Thanks in advance for your answer.