Last active
August 29, 2015 14:13
-
-
Save ikegami-yukino/67856c105c1545565132 to your computer and use it in GitHub Desktop.
PythonでMeCabの制約付き解析を使う ref: http://qiita.com/yukinoi/items/4e7afb5e72b3a46da0f2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import MeCab | |
from MeCab import MECAB_ANY_BOUNDARY, MECAB_INSIDE_TOKEN, MECAB_TOKEN_BOUNDARY | |
DICINFO_KEYS = ('charset', 'filename', 'lsize', 'rsize', 'size', 'type', 'version') | |
class Tagger(MeCab.Tagger):
    """MeCab tagger extended with morpheme-boundary-constrained parsing."""

    def dictionary_info(self):
        """Return the system dictionary's metadata as a plain dict.

        Returns:
            dict: values for each key in DICINFO_KEYS (notably 'charset',
            which is needed to compute byte offsets into the sentence).
        """
        # Call the SWIG-level accessor directly and flatten the returned
        # struct so callers do not depend on the wrapper type.
        info = MeCab._MeCab.Tagger_dictionary_info(self)
        return {key: getattr(info, key) for key in DICINFO_KEYS}

    def split_sentence(self, sentence, pattern):
        """Split *sentence* around every regex match, keeping the matches.

        Args:
            sentence (str): text to split.
            pattern (str): regex pattern marking spans to constrain.

        Yields:
            tuple[str, bool]: each piece of the sentence in order, paired
            with True when the piece is a match of *pattern*, else False.
        """
        last_found_position = 0
        for m in re.finditer(pattern, sentence):
            # Unmatched text between the previous match and this one.
            if last_found_position < m.start():
                yield (sentence[last_found_position:m.start()], False)
                last_found_position = m.start()
            yield (sentence[last_found_position:m.end()], True)
            last_found_position = m.end()
        # Trailing unmatched text, if any.
        if last_found_position < len(sentence):
            yield (sentence[last_found_position:], False)

    def boundary_constraint_parse(self, sentence, pattern='.', any_boundary=False):
        """Parse *sentence*, forcing each span matching *pattern* to stay
        inside a single morpheme.

        Args:
            sentence (str): text to parse.
            pattern (str): regex; every match is kept as one token.
            any_boundary (bool): constraint for NON-matching spans —
                MECAB_ANY_BOUNDARY (tokenizer splits freely) when True,
                MECAB_INSIDE_TOKEN when False.

        Returns:
            str | None: MeCab's formatted parse on success, implicitly
            None when self.parse(lattice) fails.
        """
        lattice = MeCab.Lattice()
        # ''.join is a no-op for a str argument; presumably this also
        # tolerates a list of tokens — TODO confirm against callers.
        lattice.set_sentence(''.join(sentence))
        if any_boundary:
            default_boundary_constraint = MECAB_ANY_BOUNDARY
        else:
            default_boundary_constraint = MECAB_INSIDE_TOKEN
        # Boundary constraints are addressed by byte offsets in the
        # dictionary's charset, hence the encode() below.
        byte_position = 0
        lattice.set_boundary_constraint(byte_position, MECAB_TOKEN_BOUNDARY)
        charset = self.dictionary_info()['charset']
        for (token, match) in self.split_sentence(sentence, pattern):
            byte_position += 1
            if match:
                # Matched span: forbid boundaries anywhere inside it.
                boundary_constraint = MECAB_INSIDE_TOKEN
            else:
                boundary_constraint = default_boundary_constraint
            # Constrain every interior byte position of this token.
            for i in range(1, len(token.encode(charset))):
                lattice.set_boundary_constraint(byte_position, boundary_constraint)
                byte_position += 1
            # End of this token: force a morpheme boundary here.
            lattice.set_boundary_constraint(byte_position, MECAB_TOKEN_BOUNDARY)
        if self.parse(lattice):
            return lattice.toString()
if __name__ == '__main__':
    # Demo: keep each ASCII alphanumeric run as a single morpheme.
    tagger = Tagger()
    text = 'ポエム読むならQiita最高'
    print('形態素境界制約付き解析\n')
    result = tagger.boundary_constraint_parse(text, '[a-zA-Z0-9\s\-]+', any_boundary=True)
    print(result)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
形態素境界制約付き解析 | |
ポエム 名詞,一般,*,*,*,*,ポエム,ポエム,ポエム | |
読む 動詞,自立,*,*,五段・マ行,基本形,読む,ヨム,ヨム | |
なら 助動詞,*,*,*,特殊・ダ,仮定形,だ,ナラ,ナラ | |
Qiita 名詞,一般,*,*,*,*,* | |
最高 名詞,一般,*,*,*,*,最高,サイコウ,サイコー | |
EOS |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import MeCab | |
DICINFO_KEYS = ('charset', 'filename', 'lsize', 'rsize', 'size', 'type', 'version') | |
class Tagger(MeCab.Tagger):
    """MeCab tagger extended with feature (part-of-speech) constrained parsing."""

    def dictionary_info(self):
        """Return the system dictionary's metadata as a plain dict.

        Returns:
            dict: values for each key in DICINFO_KEYS (notably 'charset',
            which is needed to compute byte offsets into the sentence).
        """
        # Call the SWIG-level accessor directly and flatten the returned
        # struct so callers do not depend on the wrapper type.
        info = MeCab._MeCab.Tagger_dictionary_info(self)
        return {key: getattr(info, key) for key in DICINFO_KEYS}

    def feature_constraint_parse(self, tokens):
        """Parse pre-segmented text, pinning tokens to given features.

        Args:
            tokens (list): each item is [surface] or [surface, feature];
                when the feature is omitted, '*' (unconstrained) is used.

        Yields:
            MeCab node objects of the parsed result, in sentence order.
            Yields nothing when self.parse(lattice) fails.
        """
        lattice = MeCab.Lattice()
        sentence = ''.join(x[0] for x in tokens)
        lattice.set_sentence(sentence)
        start_position = 0
        # Feature constraints are addressed by byte offsets in the
        # dictionary's charset, hence the encode() below.
        # (Fixed: the original had a redundant `charset = charset = ...`.)
        charset = self.dictionary_info()['charset']
        for x in tokens:
            if len(x) == 2:
                (token, pos) = x
            else:
                token = x[0]
                pos = '*'  # '*' leaves this token's feature unconstrained
            end_position = start_position + len(token.encode(charset))
            lattice.set_feature_constraint(start_position, end_position, pos)
            start_position = end_position
        if self.parse(lattice):
            # Walk the node chain starting from the node at position 0.
            node = lattice.begin_nodes(0)
            while node:
                yield node
                node = node.next
if __name__ == '__main__':
    # Demo: pin two tokens to explicit parts of speech, leave the rest free.
    tagger = Tagger()
    print('品詞制約付き解析\n')
    labeled_tokens = [
        ['くぅ〜', '感動詞'],
        ['マミさん', '名詞'],
        ['の'],
        ['紅茶'],
        ['めちゃウマ'],
        ['っす'],
        ['よ'],
        ['〜'],
    ]
    for node in tagger.feature_constraint_parse(labeled_tokens):
        print(node.surface, node.feature)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
品詞制約付き解析 | |
くぅ〜 名詞,サ変接続,*,*,*,*,* | |
マミさん 名詞,一般,*,*,*,*,* | |
の 助詞,連体化,*,*,*,*,の,ノ,ノ | |
紅茶 名詞,一般,*,*,*,*,紅茶,コウチャ,コーチャ | |
めちゃウマ 名詞,一般,*,*,*,*,* | |
っす 助動詞,*,*,*,特殊・デス,基本形,っす,ッス,ッス | |
よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ | |
〜 記号,一般,*,*,*,*,〜,〜,〜 | |
BOS/EOS,*,*,*,*,*,*,*,* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment