Skip to content

Instantly share code, notes, and snippets.

@altescy
Last active January 16, 2023 04:53
Show Gist options
  • Select an option

  • Save altescy/375307e2bd6a863bf6972861433da135 to your computer and use it in GitHub Desktop.

Select an option

Save altescy/375307e2bd6a863bf6972861433da135 to your computer and use it in GitHub Desktop.
import abc
import functools
from os import PathLike
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union
import fugashi
import minato
class Token(NamedTuple):
    """A single token with its character span in the original text."""

    surface: str  # surface form exactly as it appears in the text
    start: int  # offset of the token's first character in the source text
    end: int  # offset one past the token's last character (start + len(surface))
    postag: Optional[str] = None  # part-of-speech tag, if the dictionary provides one
    lemma: Optional[str] = None  # dictionary/base form, if available
class Tokenizer(abc.ABC):
    """Abstract interface for tokenizers that split raw text into Tokens."""

    @abc.abstractmethod
    def tokenize(self, text: str) -> List[Token]:
        """Split ``text`` into a list of Tokens. Subclasses must implement."""
        raise NotImplementedError
class FugashiTokenizer(Tokenizer):
    """Tokenizer backed by fugashi (MeCab) with an ipadic or unidic dictionary.

    The underlying tagger is built lazily on first use, so instances are cheap
    to construct and remain picklable: only the dictionary paths are pickled,
    and the tagger is rebuilt transparently after unpickling.
    """

    # Bound on first access to ``tagger``: maps a fugashi node plus its
    # character offset to a Token, using the active dictionary's layout.
    parse_feature: Callable[[fugashi.fugashi.Node, int], Token]

    def __init__(
        self,
        system_dictionary_path: Optional[Union[str, PathLike]] = None,
        user_dictionary_path: Optional[Union[str, PathLike]] = None,
    ) -> None:
        # "unidic-lite" is the default because it installs as a plain package.
        self._system_dictionary_path = system_dictionary_path or "unidic-lite"
        self._user_dictionary_path = user_dictionary_path

    def __getstate__(self) -> Dict[str, Any]:
        # Pickle only the dictionary paths; the tagger (a C-extension object)
        # and any cached state are rebuilt lazily after unpickling.
        return {
            "system_dictionary_path": self._system_dictionary_path,
            "user_dictionary_path": self._user_dictionary_path,
        }

    def __setstate__(self, state: Dict[str, Any]) -> None:
        self._system_dictionary_path = state["system_dictionary_path"]
        self._user_dictionary_path = state["user_dictionary_path"]

    @staticmethod
    def parse_feature_for_ipadic(token: fugashi.fugashi.Node, offset: int) -> Token:
        """
        Details about the ipadic parsed result:
        https://taku910.github.io/mecab/
        """
        return Token(
            surface=token.surface,
            start=offset,
            end=offset + len(token.surface),
            postag=token.feature[0],
            lemma=None if token.feature[0] != "記号" and token.feature[6] == "*" else token.feature[6],
        )

    @staticmethod
    def parse_feature_for_unidic(token: fugashi.fugashi.Node, offset: int) -> Token:
        """
        Details about the unidic parsed result:
        https://clrd.ninjal.ac.jp/unidic/faq.html
        """
        return Token(
            surface=token.surface,
            start=offset,
            end=offset + len(token.surface),
            postag=token.feature[0],
            # unidic's lemma lives at feature index 7; guard against shorter
            # feature tuples (e.g. unknown words).
            lemma=token.feature[7] if len(token.feature) >= 8 else None,
        )

    # FIX: the original stacked ``@property`` on ``@functools.lru_cache()``,
    # which keys the cache on ``self`` and keeps every instance alive for the
    # lifetime of the process (ruff B019). ``cached_property`` caches per
    # instance without the leak, and because ``__getstate__`` omits the cached
    # value, the tagger is rebuilt cleanly after unpickling.
    @functools.cached_property
    def tagger(self) -> fugashi.Tagger:
        user_dictionary_path = self._user_dictionary_path
        system_dictionary_path = self._system_dictionary_path
        # Resolve well-known dictionary names to their installed directories.
        if system_dictionary_path == "ipadic":
            import ipadic

            system_dictionary_path = ipadic.DICDIR
        elif system_dictionary_path == "unidic":
            import unidic

            system_dictionary_path = unidic.DICDIR
        elif system_dictionary_path == "unidic-lite":
            import unidic_lite

            system_dictionary_path = unidic_lite.DICDIR
        # Select the feature parser matching the dictionary's feature layout.
        if "ipadic" in str(self._system_dictionary_path):
            self.parse_feature = self.parse_feature_for_ipadic
        elif "unidic" in str(self._system_dictionary_path):
            self.parse_feature = self.parse_feature_for_unidic
        else:
            raise ValueError("system_dictionary_path must contain 'ipadic' or 'unidic'")
        # Build the tagger; ``-r /dev/null`` suppresses any default mecabrc.
        options = ["-r /dev/null", f"-d {minato.cached_path(system_dictionary_path)}"]
        if user_dictionary_path:
            options.append(f"-u {minato.cached_path(user_dictionary_path)}")
        return fugashi.GenericTagger(" ".join(options))

    def tokenize(self, text: str) -> List[Token]:
        """Tokenize ``text``, attaching each token's character offsets.

        Accessing ``self.tagger`` first also binds ``self.parse_feature``.
        """
        tokens: List[Token] = []
        offset = 0
        for token in self.tagger(text):
            # ``index`` skips characters (e.g. whitespace) the tagger dropped
            # between tokens, keeping offsets aligned with ``text``.
            offset = text.index(token.surface, offset)
            tokens.append(self.parse_feature(token, offset))
            offset += len(token.surface)
        return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment