Skip to content

Instantly share code, notes, and snippets.

@altescy
Last active January 16, 2023 04:53
Show Gist options
  • Select an option

  • Save altescy/375307e2bd6a863bf6972861433da135 to your computer and use it in GitHub Desktop.

Select an option

Save altescy/375307e2bd6a863bf6972861433da135 to your computer and use it in GitHub Desktop.
import abc
import functools
from os import PathLike
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Union
import fugashi
import minato
class Token(NamedTuple):
    """A single token with its character span in the original text."""

    surface: str  # surface form exactly as it appears in the text
    start: int  # offset of the token's first character in the source text
    end: int  # offset one past the token's last character (start + len(surface))
    postag: Optional[str] = None  # part-of-speech tag, if the dictionary provides one
    lemma: Optional[str] = None  # dictionary/base form, if available
class Tokenizer(abc.ABC):
    """Abstract interface for tokenizers that split raw text into Tokens."""

    @abc.abstractmethod
    def tokenize(self, text: str) -> List[Token]:
        """Split ``text`` into a list of Tokens. Subclasses must implement."""
        raise NotImplementedError
class FugashiTokenizer(Tokenizer):
    """Tokenizer backed by fugashi (MeCab) with an ipadic or unidic dictionary.

    The underlying tagger is built lazily on first use, so instances are cheap
    to construct and remain picklable: only the dictionary paths are pickled,
    and the tagger is rebuilt transparently after unpickling.
    """

    # Bound on first access to ``tagger``: maps a fugashi node plus its
    # character offset to a Token, using the active dictionary's layout.
    parse_feature: Callable[[fugashi.fugashi.Node, int], Token]

    def __init__(
        self,
        system_dictionary_path: Optional[Union[str, PathLike]] = None,
        user_dictionary_path: Optional[Union[str, PathLike]] = None,
    ) -> None:
        # "unidic-lite" is the default because it installs as a plain package.
        self._system_dictionary_path = system_dictionary_path or "unidic-lite"
        self._user_dictionary_path = user_dictionary_path

    def __getstate__(self) -> Dict[str, Any]:
        # Pickle only the dictionary paths; the tagger (a C-extension object)
        # and any cached state are rebuilt lazily after unpickling.
        return {
            "system_dictionary_path": self._system_dictionary_path,
            "user_dictionary_path": self._user_dictionary_path,
        }

    def __setstate__(self, state: Dict[str, Any]) -> None:
        self._system_dictionary_path = state["system_dictionary_path"]
        self._user_dictionary_path = state["user_dictionary_path"]

    @staticmethod
    def parse_feature_for_ipadic(token: fugashi.fugashi.Node, offset: int) -> Token:
        """
        Details about the ipadic parsed result:
        https://taku910.github.io/mecab/
        """
        return Token(
            surface=token.surface,
            start=offset,
            end=offset + len(token.surface),
            postag=token.feature[0],
            lemma=None if token.feature[0] != "記号" and token.feature[6] == "*" else token.feature[6],
        )

    @staticmethod
    def parse_feature_for_unidic(token: fugashi.fugashi.Node, offset: int) -> Token:
        """
        Details about the unidic parsed result:
        https://clrd.ninjal.ac.jp/unidic/faq.html
        """
        return Token(
            surface=token.surface,
            start=offset,
            end=offset + len(token.surface),
            postag=token.feature[0],
            # unidic's lemma lives at feature index 7; guard against shorter
            # feature tuples (e.g. unknown words).
            lemma=token.feature[7] if len(token.feature) >= 8 else None,
        )

    # FIX: the original stacked ``@property`` on ``@functools.lru_cache()``,
    # which keys the cache on ``self`` and keeps every instance alive for the
    # lifetime of the process (ruff B019). ``cached_property`` caches per
    # instance without the leak, and because ``__getstate__`` omits the cached
    # value, the tagger is rebuilt cleanly after unpickling.
    @functools.cached_property
    def tagger(self) -> fugashi.Tagger:
        user_dictionary_path = self._user_dictionary_path
        system_dictionary_path = self._system_dictionary_path
        # Resolve well-known dictionary names to their installed directories.
        if system_dictionary_path == "ipadic":
            import ipadic

            system_dictionary_path = ipadic.DICDIR
        elif system_dictionary_path == "unidic":
            import unidic

            system_dictionary_path = unidic.DICDIR
        elif system_dictionary_path == "unidic-lite":
            import unidic_lite

            system_dictionary_path = unidic_lite.DICDIR
        # Select the feature parser matching the dictionary's feature layout.
        if "ipadic" in str(self._system_dictionary_path):
            self.parse_feature = self.parse_feature_for_ipadic
        elif "unidic" in str(self._system_dictionary_path):
            self.parse_feature = self.parse_feature_for_unidic
        else:
            raise ValueError("system_dictionary_path must contain 'ipadic' or 'unidic'")
        # Build the tagger; ``-r /dev/null`` suppresses any default mecabrc.
        options = ["-r /dev/null", f"-d {minato.cached_path(system_dictionary_path)}"]
        if user_dictionary_path:
            options.append(f"-u {minato.cached_path(user_dictionary_path)}")
        return fugashi.GenericTagger(" ".join(options))

    def tokenize(self, text: str) -> List[Token]:
        """Tokenize ``text``, attaching each token's character offsets.

        Accessing ``self.tagger`` first also binds ``self.parse_feature``.
        """
        tokens: List[Token] = []
        offset = 0
        for token in self.tagger(text):
            # ``index`` skips characters (e.g. whitespace) the tagger dropped
            # between tokens, keeping offsets aligned with ``text``.
            offset = text.index(token.surface, offset)
            tokens.append(self.parse_feature(token, offset))
            offset += len(token.surface)
        return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment