Last active
August 29, 2015 14:13
-
-
Save ikegami-yukino/67856c105c1545565132 to your computer and use it in GitHub Desktop.
PythonでMeCabの制約付き解析を使う ref: http://qiita.com/yukinoi/items/4e7afb5e72b3a46da0f2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import MeCab | |
from MeCab import MECAB_ANY_BOUNDARY, MECAB_INSIDE_TOKEN, MECAB_TOKEN_BOUNDARY | |
DICINFO_KEYS = ('charset', 'filename', 'lsize', 'rsize', 'size', 'type', 'version') | |
class Tagger(MeCab.Tagger):
    """MeCab tagger extended with morpheme-boundary-constrained parsing."""

    def dictionary_info(self):
        """Return the system dictionary's metadata as a plain dict.

        Returns:
            dict: values for each key in DICINFO_KEYS (notably 'charset',
            which is needed to compute byte offsets into the sentence).
        """
        # Call the SWIG-level accessor directly and flatten the returned
        # struct so callers do not depend on the wrapper type.
        info = MeCab._MeCab.Tagger_dictionary_info(self)
        return {key: getattr(info, key) for key in DICINFO_KEYS}

    def split_sentence(self, sentence, pattern):
        """Split *sentence* around every regex match, keeping the matches.

        Args:
            sentence (str): text to split.
            pattern (str): regex pattern marking spans to constrain.

        Yields:
            tuple[str, bool]: each piece of the sentence in order, paired
            with True when the piece is a match of *pattern*, else False.
        """
        last_found_position = 0
        for m in re.finditer(pattern, sentence):
            # Unmatched text between the previous match and this one.
            if last_found_position < m.start():
                yield (sentence[last_found_position:m.start()], False)
                last_found_position = m.start()
            yield (sentence[last_found_position:m.end()], True)
            last_found_position = m.end()
        # Trailing unmatched text, if any.
        if last_found_position < len(sentence):
            yield (sentence[last_found_position:], False)

    def boundary_constraint_parse(self, sentence, pattern='.', any_boundary=False):
        """Parse *sentence*, forcing each span matching *pattern* to stay
        inside a single morpheme.

        Args:
            sentence (str): text to parse.
            pattern (str): regex; every match is kept as one token.
            any_boundary (bool): constraint for NON-matching spans —
                MECAB_ANY_BOUNDARY (tokenizer splits freely) when True,
                MECAB_INSIDE_TOKEN when False.

        Returns:
            str | None: MeCab's formatted parse on success, implicitly
            None when self.parse(lattice) fails.
        """
        lattice = MeCab.Lattice()
        # ''.join is a no-op for a str argument; presumably this also
        # tolerates a list of tokens — TODO confirm against callers.
        lattice.set_sentence(''.join(sentence))
        if any_boundary:
            default_boundary_constraint = MECAB_ANY_BOUNDARY
        else:
            default_boundary_constraint = MECAB_INSIDE_TOKEN
        # Boundary constraints are addressed by byte offsets in the
        # dictionary's charset, hence the encode() below.
        byte_position = 0
        lattice.set_boundary_constraint(byte_position, MECAB_TOKEN_BOUNDARY)
        charset = self.dictionary_info()['charset']
        for (token, match) in self.split_sentence(sentence, pattern):
            byte_position += 1
            if match:
                # Matched span: forbid boundaries anywhere inside it.
                boundary_constraint = MECAB_INSIDE_TOKEN
            else:
                boundary_constraint = default_boundary_constraint
            # Constrain every interior byte position of this token.
            for i in range(1, len(token.encode(charset))):
                lattice.set_boundary_constraint(byte_position, boundary_constraint)
                byte_position += 1
            # End of this token: force a morpheme boundary here.
            lattice.set_boundary_constraint(byte_position, MECAB_TOKEN_BOUNDARY)
        if self.parse(lattice):
            return lattice.toString()
if __name__ == '__main__':
    # Demo: keep each ASCII alphanumeric run as a single morpheme.
    tagger = Tagger()
    text = 'ポエム読むならQiita最高'
    print('形態素境界制約付き解析\n')
    result = tagger.boundary_constraint_parse(text, '[a-zA-Z0-9\s\-]+', any_boundary=True)
    print(result)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
形態素境界制約付き解析 | |
ポエム 名詞,一般,*,*,*,*,ポエム,ポエム,ポエム | |
読む 動詞,自立,*,*,五段・マ行,基本形,読む,ヨム,ヨム | |
なら 助動詞,*,*,*,特殊・ダ,仮定形,だ,ナラ,ナラ | |
Qiita 名詞,一般,*,*,*,*,* | |
最高 名詞,一般,*,*,*,*,最高,サイコウ,サイコー | |
EOS |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import MeCab | |
DICINFO_KEYS = ('charset', 'filename', 'lsize', 'rsize', 'size', 'type', 'version') | |
class Tagger(MeCab.Tagger):
    """MeCab tagger extended with feature (part-of-speech) constrained parsing."""

    def dictionary_info(self):
        """Return the system dictionary's metadata as a plain dict.

        Returns:
            dict: values for each key in DICINFO_KEYS (notably 'charset',
            which is needed to compute byte offsets into the sentence).
        """
        # Call the SWIG-level accessor directly and flatten the returned
        # struct so callers do not depend on the wrapper type.
        info = MeCab._MeCab.Tagger_dictionary_info(self)
        return {key: getattr(info, key) for key in DICINFO_KEYS}

    def feature_constraint_parse(self, tokens):
        """Parse pre-segmented text, pinning tokens to given features.

        Args:
            tokens (list): each item is [surface] or [surface, feature];
                when the feature is omitted, '*' (unconstrained) is used.

        Yields:
            MeCab node objects of the parsed result, in sentence order.
            Yields nothing when self.parse(lattice) fails.
        """
        lattice = MeCab.Lattice()
        sentence = ''.join(x[0] for x in tokens)
        lattice.set_sentence(sentence)
        start_position = 0
        # Feature constraints are addressed by byte offsets in the
        # dictionary's charset, hence the encode() below.
        # (Fixed: the original had a redundant `charset = charset = ...`.)
        charset = self.dictionary_info()['charset']
        for x in tokens:
            if len(x) == 2:
                (token, pos) = x
            else:
                token = x[0]
                pos = '*'  # '*' leaves this token's feature unconstrained
            end_position = start_position + len(token.encode(charset))
            lattice.set_feature_constraint(start_position, end_position, pos)
            start_position = end_position
        if self.parse(lattice):
            # Walk the node chain starting from the node at position 0.
            node = lattice.begin_nodes(0)
            while node:
                yield node
                node = node.next
if __name__ == '__main__':
    # Demo: pin two tokens to explicit parts of speech, leave the rest free.
    tagger = Tagger()
    print('品詞制約付き解析\n')
    labeled_tokens = [
        ['くぅ〜', '感動詞'],
        ['マミさん', '名詞'],
        ['の'],
        ['紅茶'],
        ['めちゃウマ'],
        ['っす'],
        ['よ'],
        ['〜'],
    ]
    for node in tagger.feature_constraint_parse(labeled_tokens):
        print(node.surface, node.feature)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
品詞制約付き解析 | |
くぅ〜 名詞,サ変接続,*,*,*,*,* | |
マミさん 名詞,一般,*,*,*,*,* | |
の 助詞,連体化,*,*,*,*,の,ノ,ノ | |
紅茶 名詞,一般,*,*,*,*,紅茶,コウチャ,コーチャ | |
めちゃウマ 名詞,一般,*,*,*,*,* | |
っす 助動詞,*,*,*,特殊・デス,基本形,っす,ッス,ッス | |
よ 助詞,終助詞,*,*,*,*,よ,ヨ,ヨ | |
〜 記号,一般,*,*,*,*,〜,〜,〜 | |
BOS/EOS,*,*,*,*,*,*,*,* |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment