Last active
August 2, 2018 04:55
-
-
Save tuxedocat/f0ec90858d6b54b94f0b80c66802ce3f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from janome.tokenizer import Tokenizer | |
from janome.analyzer import Analyzer | |
from janome.charfilter import * | |
from janome.tokenfilter import * | |
from typing import * | |
import pickle | |
class TokenizerFactory:
    """Factory for Janome-based Japanese tokenizer callables.

    Builds a janome ``Analyzer`` from a configurable filter pipeline and,
    via ``__call__``, hands out bound tokenizer functions (surface form,
    base form, reading, etc.).  The factory is picklable: ``__reduce_ex__``
    reconstructs it from its configuration instead of trying to pickle the
    (unpicklable) janome internals.

    Parameters
    ----------
    token_filters : iterable of str
        Which token filters to enable; recognized names are ``'compound'``,
        ``'pos'`` and ``'lowercase'``.
    exclude_pos : iterable of str or None
        Part-of-speech prefixes to drop when the ``'pos'`` filter is enabled
        (default: symbols and particles).  ``None`` disables POS filtering.
    """

    def __init__(self, token_filters=('compound', 'pos', 'lowercase'), exclude_pos=('記号', '助詞')):
        # Keep the full configuration on the instance so __reduce_ex__ can
        # faithfully rebuild the factory on unpickling.  (The original code
        # only kept token_filters and pickled with an empty config, which
        # silently dropped every filter after a pickle round-trip.)
        self.token_filters = tuple(token_filters)
        self.exclude_pos = tuple(exclude_pos) if exclude_pos is not None else None

        char_filters = [UnicodeNormalizeCharFilter()]
        # mmap=False so the dictionary is read into memory; presumably chosen
        # for pickling/portability reasons — TODO confirm against janome docs.
        tokenizer = Tokenizer(mmap=False)

        # Build the token-filter pipeline; name chosen so it does not shadow
        # the constructor parameter.
        filters = []
        if 'compound' in self.token_filters:
            filters.append(CompoundNounFilter())
        if 'pos' in self.token_filters and self.exclude_pos is not None:
            filters.append(POSStopFilter(list(self.exclude_pos)))
        if 'lowercase' in self.token_filters:
            filters.append(LowerCaseFilter())

        self.analyzer_ = Analyzer(char_filters, tokenizer, filters)
        # Set by __call__; the most recently selected tokenizer function.
        self.tokenizer_ = None

    def _base(self, s: str) -> List[str]:
        """Tokenize *s* and return the base (dictionary) form of each token."""
        return [t.base_form for t in self.analyzer_.analyze(s)]

    def _surface(self, s: str) -> List[str]:
        """Tokenize *s* and return the surface form of each token."""
        return [t.surface for t in self.analyzer_.analyze(s)]

    def _yomi(self, s: str) -> List[str]:
        """Tokenize *s* and return the reading (yomi) of each token."""
        return [t.reading for t in self.analyzer_.analyze(s)]

    def _phonetic(self, s: str) -> List[str]:
        """Tokenize *s* and return the phonetic form of each token."""
        return [t.phonetic for t in self.analyzer_.analyze(s)]

    def _base_with_pos(self, s: str) -> List[str]:
        """Tokenize *s* and return 'base_form/part_of_speech' strings."""
        return [f'{t.base_form}/{t.part_of_speech}' for t in self.analyzer_.analyze(s)]

    def _surface_with_pos(self, s: str) -> List[str]:
        """Tokenize *s* and return 'surface/part_of_speech' strings."""
        return [f'{t.surface}/{t.part_of_speech}' for t in self.analyzer_.analyze(s)]

    def __call__(self, tokenization_type='surface'):
        """Select and return the tokenizer function for *tokenization_type*.

        Raises
        ------
        NotImplementedError
            If *tokenization_type* is not a supported type name.
        """
        # Dispatch table replaces the if/elif chain; the *_with_pos variants
        # were previously defined but unreachable — they are now selectable
        # (a backward-compatible extension: all old names behave as before).
        dispatch = {
            'base': self._base,
            'surface': self._surface,
            'yomi': self._yomi,
            'phonetic': self._phonetic,
            'base_with_pos': self._base_with_pos,
            'surface_with_pos': self._surface_with_pos,
        }
        try:
            self.tokenizer_ = dispatch[tokenization_type]
        except KeyError:
            raise NotImplementedError(
                f'Tokenization type {tokenization_type} is not supported.') from None
        return self.tokenizer_

    def __reduce_ex__(self, protocol):
        """Pickle as (class, config) so the analyzer is rebuilt on load.

        The janome Analyzer holds unpicklable state, so only the constructor
        arguments are serialized.  BUG FIX: the original returned ``('',)``,
        which reconstructed the factory with an empty configuration and thus
        no filters after unpickling.
        """
        return type(self), (self.token_filters, self.exclude_pos)
# Obtain the default tokenizer (surface forms) from a fresh factory, then
# verify that the bound tokenizer function survives a pickle round-trip.
tokenizer_func = TokenizerFactory()(tokenization_type='surface')
unpickled = pickle.loads(pickle.dumps(tokenizer_func))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment