-
-
Save shimizukawa/5931218 to your computer and use it in GitHub Desktop.
MeCabの出力結果を基に接頭辞や接尾辞を連結する。fork元のコードと機能は同じ。リファクタリングしました。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
import re | |
from collections import OrderedDict #python2.7 or later | |
import MeCab | |
class DetermineDescriptor(object):
    """Computed boolean attribute: does the owner's ``feature`` start with a known prefix?

    The descriptor defines no ``__set__``, so it is a non-data descriptor:
    assigning the attribute on an instance stores a plain value in that
    instance's ``__dict__``, and that value is returned from then on instead
    of the computed one.  ``Word.combine`` relies on this to override flags.
    """

    def __init__(self, *allowed_features):
        """Remember the feature prefixes that make the attribute true.

        :param allowed_features: any number of prefix strings; with none
            given the attribute is always false.
        """
        self.allowed_features = allowed_features

    def __get__(self, instance, klass=None):
        """Return True when ``instance.feature`` starts with an allowed prefix.

        Accessed on the class itself (``instance`` is None) the descriptor
        object is returned, so introspection does not fail with an
        AttributeError on ``None.feature``.
        """
        if instance is None:
            return self
        # startswith() accepts a tuple of candidates and tests them in one call.
        return instance.feature.startswith(self.allowed_features)
class Word(object):
    """One MeCab morpheme: its surface text and its comma-separated feature string.

    The ``is_*`` class attributes below are DetermineDescriptor instances that
    test ``feature`` against fixed prefixes.  ``combine`` overrides some of
    them per instance by plain assignment; that works because
    DetermineDescriptor defines no ``__set__``.
    """

    # Feature prefixes of words that may be joined onto a preceding prefix.
    _CONNECTABLE_FEATURES = (
        u"名詞,一般",
        u"名詞,数",
        u"名詞,サ変接続",
        u"名詞,接尾,一般",
        u"名詞,接尾,サ変接続",
        u"名詞,固有名詞",
        u"名詞,形容動詞語幹",
        u"名詞,副詞可能",
        u"記号,アルファベット",
    )

    # Single-character surfaces that are never connectable, whatever the
    # feature string says.
    _DISALLOWED_SYMBOLS = frozenset(u"()[]<>|\"';,")

    def __init__(self, surface, feature):
        """Store the surface form and the feature string as given."""
        self.surface = surface
        self.feature = feature

    def __repr__(self):
        """Return ``<Word "surface", "feature">`` as the interpreter's native str."""
        text = u'<{0.__class__.__name__} "{0.surface}", "{0.feature}">'.format(self)
        if str is bytes:
            # Python 2: repr() has to be a byte string, so encode as before.
            return text.encode('utf-8')
        # Python 3: __repr__ must return str; returning bytes raises TypeError.
        return text

    @property
    def is_connected(self):
        """True when the feature is connectable and the surface is not a listed symbol."""
        return (self.feature.startswith(self._CONNECTABLE_FEATURES) and
                self.surface not in self._DISALLOWED_SYMBOLS)

    is_adjective = DetermineDescriptor(
        u"名詞,形容動詞語幹",
        u"名詞,ナイ形容詞語幹",
    )
    is_prefix = DetermineDescriptor(
        u"接頭詞,名詞接続",
    )
    is_postfix = DetermineDescriptor(
        u"名詞,接尾,形容動詞語幹",
        u"名詞,接尾,一般",
    )
    is_digit_prefix = DetermineDescriptor(
        u"接頭詞,数接続",
    )
    is_numerative = DetermineDescriptor(
        u"名詞,接尾,助数詞",
    )
    is_digit = DetermineDescriptor(
        u"名詞,数",
    )
    is_noun = DetermineDescriptor(
        u"名詞",
    )

    @property
    def is_digit_only(self):
        """True when ``surface.isdigit()`` holds."""
        return self.surface.isdigit()

    # Bound ``search`` of a pattern matching a run of ASCII punctuation only.
    _is_symbol_only = re.compile(r'^[!"#\$\%\&\'\(\)\*\+,\-\./:;\<\=\>\?\@\[\\\]\^\_\`\{\}\~\|]+$').search

    @property
    def is_symbol_only(self):
        """Match object (truthy) when the surface is ASCII punctuation only, else None."""
        return self._is_symbol_only(self.surface)

    def combine(self, other):
        """Try to append the following word ``other`` onto this word.

        The first matching rule wins.  On a merge, ``other.surface`` is
        appended to ``self.surface`` in place, some flags are overridden on
        ``other`` (as instance attributes), and None is returned.

        When no rule matches, ``other.is_noun`` is set to
        ``other.is_connected`` and the result is:

        * False -- ``other`` is itself a candidate (connectable, adjective
          stem, postfix, prefix, digit prefix or noun); the original author
          annotated this as "self is finalized".
        * True -- ``other`` is none of those; annotated as "self is not
          finalized".

        NOTE(review): the merge branches return None, which callers testing
        ``not combine(...)`` treat like False -- confirm this is intended.
        """
        if self.is_prefix and other.is_connected:
            # A prefix is joined to the noun that follows it.
            self.surface += other.surface
            other.is_noun = True
            other.is_prefix = False
        elif self.is_noun and other.is_postfix:
            # A postfix is joined when the word before it is a noun.
            self.surface += other.surface
            other.is_noun = False
            other.is_prefix = False
        elif self.is_digit_prefix and other.is_digit:
            # A number following a numeric prefix is joined to it.
            self.surface += other.surface
            other.is_digit = True
            other.is_digit_prefix = False
        elif self.is_digit and other.is_numerative:
            # A counter word following a number is joined to it.
            self.surface += other.surface
            other.is_digit = False
            other.is_noun = False
        else:
            # Evaluate the candidate test BEFORE overriding other.is_noun,
            # because the test itself reads other.is_noun.
            is_candidate = (
                other.is_connected or other.is_adjective or other.is_postfix or
                other.is_prefix or other.is_digit_prefix or other.is_noun
            )
            other.is_noun = other.is_connected
            return not is_candidate
        # Reached only by the four merge branches above.
        return None
def _mecab_text(value):
    """Return ``value`` as text, decoding UTF-8 when it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def mecab_parse_iterator(tagger, text):
    """Yield a Word for each node ``tagger.parseToNode`` produces for ``text``.

    :param tagger: object with a ``parseToNode`` method (a ``MeCab.Tagger``
        instance according to the original docstring).
    :param text: the text to analyse; text or UTF-8 encoded bytes.
    :returns: generator of ``Word(surface, feature)``, both as text.

    A byte string that is already UTF-8 encoded is no longer re-encoded; on
    Python 2 that step raised UnicodeDecodeError for non-ASCII input.
    """
    if str is bytes:
        # Python 2: hand the tagger a UTF-8 byte string, as the original did.
        native = text if isinstance(text, bytes) else text.encode('utf-8')
    else:
        # NOTE(review): Python 3 MeCab bindings are assumed to take and
        # return str -- confirm against the installed binding.
        native = _mecab_text(text)
    # ``native`` stays referenced for the whole iteration, like the rebound
    # ``text`` local did in the original.
    node = tagger.parseToNode(native)
    while node:
        yield Word(_mecab_text(node.surface), _mecab_text(node.feature))
        node = node.next
def extract_noun(text):
    """Count the candidate word surfaces found in ``text``.

    Every word from ``mecab_parse_iterator`` is offered to the previous
    word's ``combine``; when that returns a falsy value the word is kept as
    a candidate.  Surfaces are read only after the whole text has been
    scanned, so in-place merges performed by ``combine`` show up in the keys.

    :param text: text to analyse, passed straight to ``mecab_parse_iterator``.
    :returns: OrderedDict mapping surface to occurrence count, in order of
        first appearance; digit-only and symbol-only surfaces are left out.
    """
    previous = Word('', '')
    candidates = []
    for current in mecab_parse_iterator(MeCab.Tagger(), text):
        merged_or_pending = previous.combine(current)
        if not merged_or_pending:
            candidates.append(current)
        # NOTE(review): the source's indentation was lost; this assignment is
        # taken to run on every iteration, not only when a word is kept.
        previous = current

    counts = OrderedDict()
    for candidate in candidates:
        if candidate.is_digit_only or candidate.is_symbol_only:
            continue
        key = candidate.surface
        counts[key] = counts.get(key, 0) + 1
    return counts
if __name__ == "__main__":
    # Command-line entry point: count the noun phrases in the first argument
    # and print one "surface count" pair per line.
    import sys

    if len(sys.argv) < 2:
        # Without an argument the original died with a bare IndexError.
        sys.exit("usage: {0} TEXT".format(sys.argv[0]))

    text = sys.argv[1]
    if isinstance(text, bytes):
        # Python 2 delivers argv as byte strings; decode before handing the
        # text on, otherwise the later encode step raises UnicodeDecodeError
        # for non-ASCII input.
        text = text.decode(sys.getfilesystemencoding() or 'utf-8')

    worddic = extract_noun(text)
    for word, num in worddic.items():
        # A single formatted argument keeps the output "word num" on both
        # Python 2 and Python 3.
        print(u"{0} {1}".format(word, num))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment