Fast/Slow tokenizers with huggingface/tokenizers
import unicodedata
from typing import Dict, List, Optional, Union

import MeCab
from tokenizers import (
    AddedToken,
    BertWordPieceTokenizer,
    Encoding,
    EncodeInput,
    InputSequence,
)

# from tokenizers.pre_tokenizers import BertPreTokenizer, Sequence
class MecabPreTokenizer:
    """MeCab-based pre-tokenizer.

    It is not possible to inherit from tokenizers' PreTokenizer, so this is a
    plain callable applied to the text before encoding.
    """

    def __init__(
        self,
        mecab_dict_path: Optional[str] = None,
        do_lower_case: bool = False,
        space_replacement: Optional[str] = None,
    ):
        """Constructs a MecabPreTokenizer for huggingface tokenizers.

        - space_replacement: Character that spaces are replaced with.
          You might want to use it because MeCab drops spaces by default.
          Spaces can be restored later by replacing this character back with spaces.
          Special characters such as '_' are sometimes used.
        """
        self.do_lower_case = do_lower_case
        self.space_replacement = space_replacement

        mecab_option = (
            f"-Owakati -d {mecab_dict_path}"
            if mecab_dict_path is not None
            else "-Owakati"
        )
        self.mecab = MeCab.Tagger(mecab_option)

    def __call__(self, text: str) -> str:
        return self.pre_tokenize_str(text)

    def pre_tokenize_str(self, sequence: str) -> str:
        """Pre-tokenize the given string with MeCab.

        Unlike :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`,
        this does not keep track of the alignment, nor does it provide the
        capabilities of :class:`~tokenizers.PreTokenizedString`.

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`str`: The normalized text with morphemes joined by single spaces
        """
        text = unicodedata.normalize("NFKC", sequence)
        if self.do_lower_case:
            text = text.lower()
        if self.space_replacement:
            text = text.replace(" ", self.space_replacement)
        return self.mecab.parse(text).strip()
class MecabBertWordPieceTokenizer(BertWordPieceTokenizer):
    """Fast tokenizer: BertWordPieceTokenizer preceded by MeCab pre-tokenization."""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = False,  # for ja
        strip_accents: bool = False,  # for ja
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
        mecab_dict_path: Optional[str] = None,
        space_replacement: Optional[str] = None,
    ):
        """vocab: vocab.txt for WordPiece"""
        super().__init__(
            vocab=vocab,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
            wordpieces_prefix=wordpieces_prefix,
        )
        # self.pre_tokenizer = Sequence([
        #     MecabPreTokenizer(
        #         mecab_dict_path, lowercase, space_replacement
        #     ),
        #     BertPreTokenizer()
        # ])
        self.mecab_pretok = MecabPreTokenizer(
            mecab_dict_path, lowercase, space_replacement
        )

    def encode(
        self,
        sequence: InputSequence,
        pair: Optional[InputSequence] = None,
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> Encoding:
        if not is_pretokenized:
            sequence = self.mecab_pretok(sequence)
            # Also pre-tokenize the pair, mirroring encode_batch below.
            if pair is not None:
                pair = self.mecab_pretok(pair)
        return super().encode(sequence, pair, is_pretokenized, add_special_tokens)

    def encode_batch(
        self,
        inputs: List[EncodeInput],
        is_pretokenized: bool = False,
        add_special_tokens: bool = True,
    ) -> List[Encoding]:
        if not is_pretokenized:
            # NOTE: rejects the Tuple[List[str], str] pattern like
            # ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
            inputs = [
                self.mecab_pretok(sequence)
                if isinstance(sequence, str)
                else tuple(map(self.mecab_pretok, sequence))
                for sequence in inputs
            ]
        return super().encode_batch(inputs, is_pretokenized, add_special_tokens)
from collections import OrderedDict

from transformers import BertTokenizer, WordpieceTokenizer
from transformers.tokenization_bert import load_vocab


class MecabBertTokenizer(BertTokenizer):
    """Slow tokenizer: BertTokenizer with MeCab pre-tokenization."""

    def __init__(
        self,
        vocab_file: str,
        mecab_dict_path: Optional[str] = None,
        do_lower_case: bool = False,
        space_replacement: Optional[str] = None,
        **kwargs,
    ):
        # BertTokenizer requires vocab_file, so pass it (and the casing option) through.
        super().__init__(vocab_file, do_lower_case=do_lower_case, **kwargs)
        self.mecab_pretok = MecabPreTokenizer(
            mecab_dict_path, do_lower_case, space_replacement
        )
        self.space_replacement = space_replacement
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()]
        )
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab, unk_token=self.unk_token
        )

    def _tokenize(self, text: str) -> List[str]:
        pretokenized_text = self.mecab_pretok(text)
        tokens = pretokenized_text.split(" ")
        # Run WordPiece on each morpheme; optionally restore spaces that were
        # protected with space_replacement during pre-tokenization.
        split_tokens = [
            sub_token
            if self.space_replacement is None
            else sub_token.replace(self.space_replacement, " ")
            for token in tokens
            for sub_token in self.wordpiece_tokenizer.tokenize(token)
        ]
        return split_tokens
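
For reference, a minimal usage sketch of the two tokenizers above. The vocab path (vocab.txt) and the sample sentence are hypothetical; MeCab with a dictionary and a BERT WordPiece vocab file must be available locally, and the resulting tokens depend on that vocab.

# Minimal usage sketch (hypothetical vocab path; requires MeCab and a WordPiece vocab.txt).
fast_tok = MecabBertWordPieceTokenizer("vocab.txt", lowercase=False)
encoding = fast_tok.encode("日本語の文をトークナイズする。")
print(encoding.tokens)  # subword tokens between '[CLS]' and '[SEP]', depending on the vocab

slow_tok = MecabBertTokenizer(vocab_file="vocab.txt")
print(slow_tok.tokenize("日本語の文をトークナイズする。"))  # WordPiece tokens without special tokens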
kzinmr commented on Jan 5, 2021:
Make the Tagger picklable. See: https://tma15.github.io/blog/2020/11/22/pythonmecab%E3%81%AEtagger%E3%82%AA%E3%83%96%E3%82%B8%E3%82%A7%E3%82%AF%E3%83%88%E3%82%92%E6%8C%81%E3%81%A4%E5%8D%98%E8%AA%9E%E5%88%86%E5%89%B2%E5%99%A8%E3%82%92pickle%E3%81%A7%E4%BF%9D%E5%AD%98%E3%81%99%E3%82%8B%E6%96%B9%E6%B3%95/
class PicklableTagger:
    """MeCab.Tagger wrapper that can be pickled by re-creating the Tagger from its option string."""

    def __init__(self, mecab_option: str):
        self.option = mecab_option
        self.tagger = MeCab.Tagger(mecab_option)

    def __getstate__(self):
        return {'option': self.option}

    def __setstate__(self, state):
        for k, v in state.items():
            setattr(self, k, v)

    def __getnewargs__(self):
        return (self.option,)

    def __reduce_ex__(self, proto):
        func = PicklableTagger
        args = self.__getnewargs__()
        state = self.__getstate__()
        listitems = None
        dictitems = None
        rv = (func, args, state, listitems, dictitems)
        return rv

    def __call__(self, text):
        return self.tagger.parse(text).rstrip()
class MecabPreTokenizer:
    """MeCab-based pre-tokenizer.

    It is not possible to inherit from tokenizers' PreTokenizer, so this is a
    plain callable applied to the text before encoding.
    """

    def __init__(
        self,
        mecab_dict_path: Optional[str] = None,
        do_lower_case: bool = False,
        space_replacement: Optional[str] = None,
    ):
        """Constructs a MecabPreTokenizer for huggingface tokenizers.

        - space_replacement: Character that spaces are replaced with.
          You might want to use it because MeCab drops spaces by default.
          Spaces can be restored later by replacing this character back with spaces.
          Special characters such as '_' are sometimes used.
        """
        self.do_lower_case = do_lower_case
        self.space_replacement = space_replacement

        mecab_option = (
            f"-Owakati -d {mecab_dict_path}"
            if mecab_dict_path is not None
            else "-Owakati"
        )
        self.mecab = PicklableTagger(mecab_option)

    def __call__(self, text: str) -> str:
        return self.pre_tokenize_str(text)

    def pre_tokenize_str(self, sequence: str) -> str:
        """Pre-tokenize the given string with MeCab.

        Unlike :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`,
        this does not keep track of the alignment, nor does it provide the
        capabilities of :class:`~tokenizers.PreTokenizedString`.

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`str`: The normalized text with morphemes joined by single spaces
        """
        text = unicodedata.normalize("NFKC", sequence)
        if self.do_lower_case:
            text = text.lower()
        if self.space_replacement:
            text = text.replace(" ", self.space_replacement)
        return self.mecab(text)
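
A quick sanity check that the workaround survives a pickle round-trip. This sketch assumes MeCab and its default dictionary are installed; the sample sentence is arbitrary.

import pickle

# The pre-tokenizer now only holds a PicklableTagger plus plain attributes, so the
# whole object pickles; on load, PicklableTagger rebuilds its MeCab.Tagger from the
# stored option string.
pretok = MecabPreTokenizer()
restored = pickle.loads(pickle.dumps(pretok))
print(restored("すもももももももものうち"))  # e.g. "すもも も もも も もも の うち"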
- Load tokenizers.Tokenizer:
import json
import os
import tempfile

from tokenizers import Tokenizer


def load_tokenizer(tokenizer_file: str) -> Tokenizer:
    """Load a MecabBertWordPieceTokenizer from tokenizer.json.

    This is necessary for the following reasons:
    - BertWordPieceTokenizer cannot be loaded from tokenizer.json via the .from_file() method
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPreTokenizer is not a valid native PreTokenizer.
    """
    with open(tokenizer_file) as fp:
        jd = json.loads(fp.read())
    # Re-use the saved normalizer settings (clean_text, lowercase, ...) as constructor kwargs.
    settings = jd['normalizer']
    settings.pop('type')
    vocab_map = jd['model']['vocab']
    # Rebuild vocab.txt from the saved WordPiece vocab, ordered by token id.
    with tempfile.TemporaryDirectory() as dname:
        vocab_file = os.path.join(dname, "vocab.txt")
        with open(vocab_file, 'w') as fp:
            fp.write('\n'.join([w for w, vid in sorted(vocab_map.items(), key=lambda x: x[1])]))
        tokenizer = MecabBertWordPieceTokenizer(vocab_file, **settings)
    return tokenizer
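
A short usage sketch for the loader above. The path is hypothetical; tokenizer.json is assumed to have been written by MecabBertWordPieceTokenizer's save(), so it contains a normalizer section and a WordPiece vocab.

# Hypothetical path to a tokenizer.json saved by MecabBertWordPieceTokenizer.save()
tokenizer = load_tokenizer("out/tokenizer.json")
print(tokenizer.encode("日本語の文をトークナイズする。").tokens)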
- Load transformers.PreTrainedTokenizerFast from the custom tokenizer (MecabBertWordPieceTokenizer here):
import json
import os
import tempfile

from transformers import AutoTokenizer, PreTrainedTokenizerFast


def load_pretrained_tokenizer(tokenizer_file: str, cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load a PreTrainedTokenizerFast backed by MecabBertWordPieceTokenizer from tokenizer.json.

    This is necessary for the following reasons:
    - BertWordPieceTokenizer cannot be loaded from tokenizer.json via the .from_file() method
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPreTokenizer is not a valid native PreTokenizer.
    """
    with open(tokenizer_file) as fp:
        jd = json.loads(fp.read())
    settings = jd['normalizer']
    settings.pop('type')
    vocab_map = jd['model']['vocab']
    with tempfile.TemporaryDirectory() as dname:
        vocab_file = os.path.join(dname, "vocab.txt")
        with open(vocab_file, 'w') as fp:
            fp.write('\n'.join([w for w, vid in sorted(vocab_map.items(), key=lambda x: x[1])]))
        tokenizer = MecabBertWordPieceTokenizer(vocab_file, **settings)

    tokenizer_dir = os.path.dirname(tokenizer_file)
    pt_tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
        tokenizer_dir,
        cache_dir=cache_dir,
    )
    # This is necessary for pt_tokenizer.save_pretrained(save_path)
    pt_tokenizer._tokenizer = tokenizer._tokenizer
    return pt_tokenizer
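
And a sketch for the transformers wrapper. The directory layout is an assumption: the folder holding tokenizer.json must also contain whatever config files AutoTokenizer.from_pretrained() expects (e.g. tokenizer_config.json).

# Hypothetical layout: out/ holds tokenizer.json plus the config files AutoTokenizer needs.
pt_tok = load_pretrained_tokenizer("out/tokenizer.json")
pt_tok.save_pretrained("out/exported")  # possible because the backend _tokenizer was swapped in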