shimizukawa · June 6, 2023 02:59
diff --git a/extractword.py b/extractword.py
 #!/usr/bin/env python
 # -*- coding:utf-8 -*-
 import re
 from collections import OrderedDict  #python2.7 or later

 import MeCab


 class DetermineDescriptor(object):
    """Read-only flag that is true when ``instance.feature`` starts with one
    of the configured prefixes.

    Only ``__get__`` is defined, so assigning to the attribute on an
    instance stores a plain instance attribute that takes precedence over
    this descriptor from then on.
    """

    def __init__(self, *allowed_features):
        """Remember the feature prefixes that make the flag true.

        Args:
            *allowed_features: prefix strings compared against the start of
                the owning instance's ``feature`` attribute.
        """
        self.allowed_features = allowed_features

    def __get__(self, instance, klass):
        """Return whether ``instance.feature`` starts with an allowed prefix.

        When accessed on the class itself (``instance`` is None) the
        descriptor object is returned instead of raising AttributeError.
        """
        if instance is None:
            return self
        feature = instance.feature
        return any(feature.startswith(prefix)
                   for prefix in self.allowed_features)


 class Word(object):
    """One token: a surface string plus the feature string describing it.

    The ``is_*`` flags are derived from the start of ``feature``.  Flags
    declared with ``DetermineDescriptor`` can be overridden on a single
    instance by plain assignment; ``combine`` relies on that.
    """

    # Feature prefixes that make a token connectable (see ``is_connected``).
    _CONNECTED_FEATURES = (
        u"名詞,一般",
        u"名詞,数",
        u"名詞,サ変接続",
        u"名詞,接尾,一般",
        u"名詞,接尾,サ変接続",
        u"名詞,固有名詞",
        u"名詞,形容動詞語幹",
        u"名詞,副詞可能",
        u"記号,アルファベット",
    )

    # A surface equal to one of these single characters is never connectable.
    # Built once here instead of on every ``is_connected`` access.
    _DISALLOWED_SYMBOLS = frozenset(u"()[]<>|\"';,")

    def __init__(self, surface, feature):
        """Store the token text (*surface*) and its *feature* string."""
        self.surface = surface
        self.feature = feature

    def __repr__(self):
        """Return ``<Word "surface", "feature">`` as the native ``str`` type."""
        text = u'<{0.__class__.__name__} "{0.surface}", "{0.feature}">'.format(self)
        # Python 2 requires a byte string from __repr__; Python 3 requires
        # text and raises TypeError if bytes are returned.
        if str is bytes:
            return text.encode('utf-8')
        return text

    @property
    def is_connected(self):
        """True when the feature is a connectable kind and the surface is not
        one of the disallowed symbol characters."""
        return (self.feature.startswith(self._CONNECTED_FEATURES) and
                self.surface not in self._DISALLOWED_SYMBOLS)

    is_adjective = DetermineDescriptor(
        u"名詞,形容動詞語幹",
        u"名詞,ナイ形容詞語幹",
        )

    is_prefix = DetermineDescriptor(
        u"接頭詞,名詞接続",
        )

    is_postfix = DetermineDescriptor(
        u"名詞,接尾,形容動詞語幹",
        u"名詞,接尾,一般",
        )

    is_digit_prefix = DetermineDescriptor(
        u"接頭詞,数接続",
        )

    is_numerative = DetermineDescriptor(
        u"名詞,接尾,助数詞",
        )

    is_digit = DetermineDescriptor(
        u"名詞,数",
        )

    is_noun = DetermineDescriptor(
        u"名詞",
        )

    @property
    def is_digit_only(self):
        """True when the surface consists solely of digit characters."""
        return self.surface.isdigit()

    # Bound ``search`` of a pattern matching a non-empty run of ASCII
    # punctuation characters spanning the whole string.
    _is_symbol_only = re.compile(r'^[!"#\$\%\&\'\(\)\*\+,\-\./:;\<\=\>\?\@\[\\\]\^\_\`\{\}\~\|]+$').search

    @property
    def is_symbol_only(self):
        """True when the surface consists solely of ASCII punctuation."""
        # Return a real bool, like the other is_* flags, not a match object.
        return self._is_symbol_only(self.surface) is not None

    def combine(self, other):
        """Try to absorb *other*, the token that follows this one.

        When a joining rule applies, ``other.surface`` is appended to
        ``self.surface`` and some flags on *other* are overridden, which
        affects later calls in which *other* acts as ``self``.

        Args:
            other: the ``Word`` that comes directly after this one.

        Returns:
            False when *other* was not absorbed and is connectable, an
            adjective stem, a prefix, a postfix, a digit prefix or a noun;
            True in every other case (including when it was absorbed).
        """
        if self.is_prefix and other.is_connected:
            # A prefix is joined to the noun that follows it.
            self.surface += other.surface
            other.is_noun = True
            other.is_prefix = False

        elif self.is_noun and other.is_postfix:
            # A postfix is joined when the preceding token is a noun.
            self.surface += other.surface
            other.is_noun = False
            other.is_prefix = False

        elif self.is_digit_prefix and other.is_digit:
            # A digit-connecting prefix followed by a number is joined.
            self.surface += other.surface
            other.is_digit = True
            other.is_digit_prefix = False

        elif self.is_digit and other.is_numerative:
            # A number followed by a counter (numerative) is joined.
            self.surface += other.surface
            other.is_digit = False
            other.is_noun = False

        else:
            # Read the flags before is_noun is overridden below.
            starts_word = (other.is_connected or other.is_adjective or
                           other.is_postfix or other.is_prefix or
                           other.is_digit_prefix or other.is_noun)
            # From here on *other* counts as a noun only if it is connectable.
            other.is_noun = other.is_connected
            if starts_word:
                return False  # self is finalized

        return True  # self is not finalized yet


 def mecab_parse_iterator(tagger, text):
    """Yield a ``Word`` for each node *tagger* produces for *text*.

    *text* is encoded to UTF-8 before being handed to
    ``tagger.parseToNode``; every node's ``surface`` and ``feature`` are
    decoded from UTF-8 again, so the yielded words hold text strings.
    The node chain is followed through ``node.next`` until it ends.
    """
    encoded = text.encode('utf-8')
    current = tagger.parseToNode(encoded)
    while current:
        surface = current.surface.decode('utf-8')
        feature = current.feature.decode('utf-8')
        yield Word(surface, feature)
        current = current.next


 def extract_noun(text):
    """Count the words extracted from *text*.

    The text is tokenized with a freshly created ``MeCab.Tagger()``.  Each
    token is offered to its predecessor's ``combine``; a token is kept as a
    candidate when that call returns False.  Candidates whose surface is
    digits only or symbols only are skipped when counting.

    Returns:
        An ``OrderedDict`` mapping surface -> occurrence count, in the order
        the surfaces were first counted.
    """
    previous = Word('', '')
    candidates = []
    for current in mecab_parse_iterator(MeCab.Tagger(), text):
        absorbed_or_skipped = previous.combine(current)
        if not absorbed_or_skipped:
            candidates.append(current)
        previous = current

    # Counting happens after the whole pass so that surfaces extended by
    # later combine() calls are counted in their final form.
    counts = OrderedDict()
    for candidate in candidates:
        if candidate.is_digit_only or candidate.is_symbol_only:
            continue
        key = candidate.surface
        counts[key] = counts.get(key, 0) + 1

    return counts


 if __name__ == "__main__":
    # Command-line entry point: print "<word> <count>" for the text given
    # as the first argument.
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: {0} TEXT".format(sys.argv[0]))

    text = sys.argv[1]
    # On Python 2 argv holds byte strings; calling .encode('utf-8') on them
    # later would implicitly decode as ASCII and fail on Japanese input.
    # NOTE(review): assumes the argument is UTF-8, like the rest of the file.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    worddic = extract_noun(text)
    for word, num in worddic.items():
        # Same "word num" output as the print statement, valid on 2 and 3.
        print(u"{0} {1}".format(word, num))
	#!/usr/bin/env python
	# -- coding:utf-8 --
	import re
	from collections import OrderedDict #python2.7 or later

	import MeCab


	class DetermineDescriptor(object):
	    """Read-only flag that is true when ``instance.feature`` starts with
	    one of the configured prefixes.

	    Only ``__get__`` is defined, so assigning to the attribute on an
	    instance stores a plain instance attribute that takes precedence
	    over this descriptor from then on.
	    """

	    def __init__(self, *allowed_features):
	        """Remember the feature prefixes that make the flag true.

	        Args:
	            *allowed_features: prefix strings compared against the start
	                of the owning instance's ``feature`` attribute.
	        """
	        self.allowed_features = allowed_features

	    def __get__(self, instance, klass):
	        """Return whether ``instance.feature`` starts with an allowed prefix.

	        When accessed on the class itself (``instance`` is None) the
	        descriptor object is returned instead of raising AttributeError.
	        """
	        if instance is None:
	            return self
	        feature = instance.feature
	        return any(feature.startswith(prefix)
	                   for prefix in self.allowed_features)


	class Word(object):
	    """One token: a surface string plus the feature string describing it.

	    The ``is_*`` flags are derived from the start of ``feature``.  Flags
	    declared with ``DetermineDescriptor`` can be overridden on a single
	    instance by plain assignment; ``combine`` relies on that.
	    """

	    # Feature prefixes that make a token connectable (see ``is_connected``).
	    _CONNECTED_FEATURES = (
	        u"名詞,一般",
	        u"名詞,数",
	        u"名詞,サ変接続",
	        u"名詞,接尾,一般",
	        u"名詞,接尾,サ変接続",
	        u"名詞,固有名詞",
	        u"名詞,形容動詞語幹",
	        u"名詞,副詞可能",
	        u"記号,アルファベット",
	    )

	    # A surface equal to one of these single characters is never
	    # connectable.  Built once here instead of on every access.  The
	    # backslash is now written as a valid escape; the resulting set is
	    # unchanged.
	    # NOTE(review): the backslash entry may be an escaping artifact of
	    # the paste - confirm whether it is intended.
	    _DISALLOWED_SYMBOLS = frozenset(u"()[]<>\\|\"';,")

	    def __init__(self, surface, feature):
	        """Store the token text (*surface*) and its *feature* string."""
	        self.surface = surface
	        self.feature = feature

	    def __repr__(self):
	        """Return ``<Word "surface", "feature">`` as the native ``str`` type."""
	        text = u'<{0.__class__.__name__} "{0.surface}", "{0.feature}">'.format(self)
	        # Python 2 requires a byte string from __repr__; Python 3 requires
	        # text and raises TypeError if bytes are returned.
	        if str is bytes:
	            return text.encode('utf-8')
	        return text

	    @property
	    def is_connected(self):
	        """True when the feature is a connectable kind and the surface is
	        not one of the disallowed symbol characters."""
	        return (self.feature.startswith(self._CONNECTED_FEATURES) and
	                self.surface not in self._DISALLOWED_SYMBOLS)

	    is_adjective = DetermineDescriptor(
	        u"名詞,形容動詞語幹",
	        u"名詞,ナイ形容詞語幹",
	        )

	    is_prefix = DetermineDescriptor(
	        u"接頭詞,名詞接続",
	        )

	    is_postfix = DetermineDescriptor(
	        u"名詞,接尾,形容動詞語幹",
	        u"名詞,接尾,一般",
	        )

	    is_digit_prefix = DetermineDescriptor(
	        u"接頭詞,数接続",
	        )

	    is_numerative = DetermineDescriptor(
	        u"名詞,接尾,助数詞",
	        )

	    is_digit = DetermineDescriptor(
	        u"名詞,数",
	        )

	    is_noun = DetermineDescriptor(
	        u"名詞",
	        )

	    @property
	    def is_digit_only(self):
	        """True when the surface consists solely of digit characters."""
	        return self.surface.isdigit()

	    # Bound ``search`` of a pattern matching a non-empty run of ASCII
	    # punctuation characters spanning the whole string.
	    _is_symbol_only = re.compile(r'^[!"#\$\%\&\'\(\)\*\+,\-\./:;\<\=\>\?\@\[\\\]\^\_\`\{\}\~\\|]+$').search

	    @property
	    def is_symbol_only(self):
	        """True when the surface consists solely of ASCII punctuation."""
	        # Return a real bool, like the other is_* flags, not a match object.
	        return self._is_symbol_only(self.surface) is not None

	    def combine(self, other):
	        """Try to absorb *other*, the token that follows this one.

	        When a joining rule applies, ``other.surface`` is appended to
	        ``self.surface`` and some flags on *other* are overridden, which
	        affects later calls in which *other* acts as ``self``.

	        Args:
	            other: the ``Word`` that comes directly after this one.

	        Returns:
	            False when *other* was not absorbed and is connectable, an
	            adjective stem, a prefix, a postfix, a digit prefix or a
	            noun; True in every other case (including when absorbed).
	        """
	        if self.is_prefix and other.is_connected:
	            # A prefix is joined to the noun that follows it.
	            self.surface += other.surface
	            other.is_noun = True
	            other.is_prefix = False

	        elif self.is_noun and other.is_postfix:
	            # A postfix is joined when the preceding token is a noun.
	            self.surface += other.surface
	            other.is_noun = False
	            other.is_prefix = False

	        elif self.is_digit_prefix and other.is_digit:
	            # A digit-connecting prefix followed by a number is joined.
	            self.surface += other.surface
	            other.is_digit = True
	            other.is_digit_prefix = False

	        elif self.is_digit and other.is_numerative:
	            # A number followed by a counter (numerative) is joined.
	            self.surface += other.surface
	            other.is_digit = False
	            other.is_noun = False

	        else:
	            # Read the flags before is_noun is overridden below.
	            starts_word = (other.is_connected or other.is_adjective or
	                           other.is_postfix or other.is_prefix or
	                           other.is_digit_prefix or other.is_noun)
	            # From here on *other* counts as a noun only if connectable.
	            other.is_noun = other.is_connected
	            if starts_word:
	                return False  # self is finalized

	        return True  # self is not finalized yet


	def mecab_parse_iterator(tagger, text):
	    """Yield a ``Word`` for each node *tagger* produces for *text*.

	    *text* is encoded to UTF-8 before being handed to
	    ``tagger.parseToNode``; every node's ``surface`` and ``feature`` are
	    decoded from UTF-8 again, so the yielded words hold text strings.
	    The node chain is followed through ``node.next`` until it ends.
	    """
	    encoded = text.encode('utf-8')
	    current = tagger.parseToNode(encoded)
	    while current:
	        surface = current.surface.decode('utf-8')
	        feature = current.feature.decode('utf-8')
	        yield Word(surface, feature)
	        current = current.next


	def extract_noun(text):
	    """Count the words extracted from *text*.

	    The text is tokenized with a freshly created ``MeCab.Tagger()``.  Each
	    token is offered to its predecessor's ``combine``; a token is kept as
	    a candidate when that call returns False.  Candidates whose surface is
	    digits only or symbols only are skipped when counting.

	    Returns:
	        An ``OrderedDict`` mapping surface -> occurrence count, in the
	        order the surfaces were first counted.
	    """
	    previous = Word('', '')
	    candidates = []
	    for current in mecab_parse_iterator(MeCab.Tagger(), text):
	        absorbed_or_skipped = previous.combine(current)
	        if not absorbed_or_skipped:
	            candidates.append(current)
	        previous = current

	    # Counting happens after the whole pass so that surfaces extended by
	    # later combine() calls are counted in their final form.
	    counts = OrderedDict()
	    for candidate in candidates:
	        if candidate.is_digit_only or candidate.is_symbol_only:
	            continue
	        key = candidate.surface
	        counts[key] = counts.get(key, 0) + 1

	    return counts


	if __name__ == "__main__":
	    # Command-line entry point: print "<word> <count>" for the text given
	    # as the first argument.
	    import sys

	    if len(sys.argv) < 2:
	        # Exit with a usage message instead of an IndexError traceback.
	        sys.exit("usage: {0} TEXT".format(sys.argv[0]))

	    text = sys.argv[1]
	    # On Python 2 argv holds byte strings; calling .encode('utf-8') on
	    # them later would implicitly decode as ASCII and fail on Japanese
	    # input.
	    # NOTE(review): assumes the argument is UTF-8, like the rest of the file.
	    if isinstance(text, bytes):
	        text = text.decode('utf-8')

	    worddic = extract_noun(text)
	    for word, num in worddic.items():
	        # Same "word num" output as the print statement, valid on 2 and 3.
	        print(u"{0} {1}".format(word, num))