Last active
January 16, 2023 04:53
-
-
Save altescy/375307e2bd6a863bf6972861433da135 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import abc | |
| import functools | |
| from os import PathLike | |
| from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union | |
| import fugashi | |
| import minato | |
class Token(NamedTuple):
    """A single token together with its character span in the source text.

    ``postag`` and ``lemma`` are optional and default to ``None`` when the
    producer of the token has no such information.
    """

    # Token text exactly as it appears in the input.
    surface: str
    # Index of the first character of the token in the input text.
    start: int
    # Index one past the last character of the token (``start + len(surface)``
    # for the tokens built in this module).
    end: int
    # Part-of-speech tag, if available.
    postag: Optional[str] = None
    # Lemma (dictionary form), if available.
    lemma: Optional[str] = None
class Tokenizer(abc.ABC):
    """Abstract interface for anything that splits text into ``Token`` objects."""

    @abc.abstractmethod
    def tokenize(self, text: str) -> List[Token]:
        """Split ``text`` into a list of tokens.

        Concrete subclasses must override this method; the base
        implementation only raises ``NotImplementedError``.
        """
        raise NotImplementedError
class FugashiTokenizer(Tokenizer):
    """Tokenizer backed by a fugashi (MeCab) tagger.

    The tagger is built lazily the first time :attr:`tagger` is accessed and
    is then cached on the instance. Only the two dictionary paths are part of
    the pickled state, so a copied or unpickled tokenizer builds its own
    tagger on first use.
    """

    # Feature parser matching the system dictionary. It is assigned while the
    # tagger is being built, so it is only available after ``tagger`` has
    # been accessed at least once.
    parse_feature: Callable[[fugashi.fugashi.Node, int], Token]

    # Per-instance tagger cache; ``None`` until the tagger is first requested.
    # Kept on the instance (rather than in a ``functools.lru_cache`` keyed by
    # ``self``) so the tagger is released together with the tokenizer and is
    # never evicted and rebuilt behind the caller's back.
    _tagger: Optional[fugashi.GenericTagger] = None

    def __init__(
        self,
        system_dictionary_path: Optional[Union[str, PathLike]] = None,
        user_dictionary_path: Optional[Union[str, PathLike]] = None,
    ) -> None:
        """Store the dictionary locations; no dictionary is loaded here.

        Args:
            system_dictionary_path: One of the names ``"ipadic"``,
                ``"unidic"`` or ``"unidic-lite"``, or a path whose string
                form contains ``"ipadic"`` or ``"unidic"``. Falls back to
                ``"unidic-lite"`` when omitted or empty.
            user_dictionary_path: Optional user dictionary passed to the
                tagger with ``-u``.
        """
        self._system_dictionary_path = system_dictionary_path or "unidic-lite"
        self._user_dictionary_path = user_dictionary_path

    def __getstate__(self) -> Dict[str, Any]:
        """Return the picklable state: only the two dictionary paths."""
        return {
            "system_dictionary_path": self._system_dictionary_path,
            "user_dictionary_path": self._user_dictionary_path,
        }

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """Restore the dictionary paths and drop any previously built tagger."""
        self._system_dictionary_path = state["system_dictionary_path"]
        self._user_dictionary_path = state["user_dictionary_path"]
        # The paths may have changed, so the tagger must be rebuilt lazily.
        self._tagger = None

    @staticmethod
    def parse_feature_for_ipadic(token: fugashi.fugashi.Node, offset: int) -> Token:
        """Convert a node parsed with an ipadic dictionary into a ``Token``.

        Details about the ipadic parsed result:
        https://taku910.github.io/mecab/

        Args:
            token: Node produced by the tagger.
            offset: Character index of the token's first character.

        Returns:
            A ``Token`` whose ``postag`` is ``feature[0]`` and whose ``lemma``
            is ``feature[6]``, except that a ``"*"`` in ``feature[6]`` becomes
            ``None`` for every part of speech other than ``"記号"``.
        """
        postag = token.feature[0]
        lemma = token.feature[6]
        # NOTE(review): "*" is kept for "記号" tokens, presumably because it
        # can be the real base form of the symbol itself - confirm.
        if postag != "記号" and lemma == "*":
            lemma = None
        return Token(
            surface=token.surface,
            start=offset,
            end=offset + len(token.surface),
            postag=postag,
            lemma=lemma,
        )

    @staticmethod
    def parse_feature_for_unidic(token: fugashi.fugashi.Node, offset: int) -> Token:
        """Convert a node parsed with a unidic dictionary into a ``Token``.

        Details about the unidic parsed result:
        https://clrd.ninjal.ac.jp/unidic/faq.html

        Args:
            token: Node produced by the tagger.
            offset: Character index of the token's first character.

        Returns:
            A ``Token`` whose ``postag`` is ``feature[0]`` and whose ``lemma``
            is ``feature[7]``, or ``None`` when the node has fewer than eight
            features.
        """
        features = token.feature
        return Token(
            surface=token.surface,
            start=offset,
            end=offset + len(token.surface),
            postag=features[0],
            lemma=features[7] if len(features) >= 8 else None,
        )

    @property
    def tagger(self) -> fugashi.GenericTagger:
        """Return this instance's tagger, building it on first access.

        Raises:
            ValueError: If the system dictionary path contains neither
                ``"ipadic"`` nor ``"unidic"``.
        """
        if self._tagger is None:
            self._tagger = self._build_tagger()
        return self._tagger

    def _build_tagger(self) -> fugashi.GenericTagger:
        """Resolve the dictionaries, select ``parse_feature`` and create the tagger."""
        system_dictionary_path = self._system_dictionary_path

        # Well-known dictionary names are mapped to the directory shipped by
        # the matching package. The imports are local so that only the
        # package actually requested has to be installed.
        if system_dictionary_path == "ipadic":
            import ipadic

            system_dictionary_path = ipadic.DICDIR
        elif system_dictionary_path == "unidic":
            import unidic

            system_dictionary_path = unidic.DICDIR
        elif system_dictionary_path == "unidic-lite":
            import unidic_lite

            system_dictionary_path = unidic_lite.DICDIR

        # setup token parser (decided from the path/name given by the caller,
        # not from the resolved package directory)
        configured_name = str(self._system_dictionary_path)
        if "ipadic" in configured_name:
            self.parse_feature = self.parse_feature_for_ipadic
        elif "unidic" in configured_name:
            self.parse_feature = self.parse_feature_for_unidic
        else:
            raise ValueError("system_dictionary_path must contain 'ipadic' or 'unidic'")

        # setup tagger
        # NOTE(review): minato.cached_path presumably resolves the location to
        # a local path - confirm against the minato documentation.
        options = ["-r /dev/null", f"-d {minato.cached_path(system_dictionary_path)}"]
        if self._user_dictionary_path:
            options.append(f"-u {minato.cached_path(self._user_dictionary_path)}")
        return fugashi.GenericTagger(" ".join(options))

    def tokenize(self, text: str) -> List[Token]:
        """Tokenize ``text`` and attach character offsets to every token.

        Each token's surface is searched for in ``text`` starting at the end
        of the previous token, so characters between two surfaces are skipped
        and the reported offsets always refer to positions in ``text``.

        Args:
            text: Text to tokenize.

        Returns:
            Tokens in the order produced by the tagger.

        Raises:
            ValueError: If a surface returned by the tagger cannot be found
                in the remaining text, or if the system dictionary is not
                supported (raised while building the tagger).
        """
        # Access the tagger first: building it also selects ``parse_feature``.
        tagger = self.tagger
        tokens: List[Token] = []
        offset = 0
        for node in tagger(text):
            offset = text.index(node.surface, offset)
            tokens.append(self.parse_feature(node, offset))
            offset += len(node.surface)
        return tokens
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment