Created
October 15, 2011 11:49
-
-
Save yamakk/1289460 to your computer and use it in GitHub Desktop.
mymecab.py MyMeCab().parse()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf-8 | |
import datetime | |
import time | |
import unicodedata | |
import MeCab | |
from HTMLParser import HTMLParser | |
import re | |
class MyMeCab(object):
    '''
    MeCab wrapper that normalizes text before tokenizing.

    Based on http://blog.livedoor.jp/techblog/archives/65828235.html :
    the MeCab user dictionary was built from normalized Wikipedia and
    kaomoji (emoticon) data, so this class applies the same
    normalization before morphological analysis.
    '''
    def __init__(self):
        # Reuse a single Tagger/HTMLParser instance; constructing
        # MeCab.Tagger() on every call is ~15x slower (see timing note
        # under __main__).
        self.mecab = MeCab.Tagger('-Ochasen')
        self._htmlparser = HTMLParser()
        # Matches a single \n or \r between two non-newline characters,
        # i.e. a soft line wrap (two or more newlines are kept as a break).
        self._pattern_lf = re.compile(r'([^\n\r])[\n\r]([^\n\r])')
    def parse(self, text):
        '''Normalize *text* (unicode) and return its noun surfaces.'''
        return self.split(self.normalize(text))
    def split(self, text):
        '''Tokenize *text* (unicode); return general/proper noun surfaces.

        Returns a list of unicode strings.
        '''
        words = []
        node = self.mecab.parseToNode(text.encode('UTF-8'))
        while node:
            # BUG FIX: under Python 2, node.feature is a UTF-8 byte str;
            # comparing it directly against unicode literals such as
            # u'名詞' triggers an implicit ASCII decode (UnicodeWarning)
            # and compares unequal.  Decode once, split once per node.
            feature = node.feature.decode('utf-8').split(',')
            hinshi = feature[0]   # part of speech
            hinshi2 = feature[1]  # part-of-speech subtype
            if hinshi == u'名詞' and hinshi2 in (u'一般', u'固有名詞'):
                words.append(node.surface.decode('utf-8').strip())
            node = node.next
        return words
    def normalize(self, text):
        '''Apply the same normalization used to build the user dictionary.

        *text* must be unicode; returns unicode.
        '''
        # NFKC normalization [ half-width ﾌﾟ -> full-width プ ]
        text = unicodedata.normalize('NFKC', text)
        # Lowercase all alphabetic characters [ ABC -> abc ]
        text = text.lower()
        # Decode HTML entities to unicode characters [ &hearts; -> ♥ ]
        text = self._htmlparser.unescape(text)
        # Replace the wave dash with the long-vowel mark [ プ~ -> プー ]
        text = text.replace(u'~', u'-')
        # Full-width minus is already handled by NFKC.
        #text = text.replace(u'−', u'-')
        # Replace other dashes with the long-vowel mark [ プ— -> プー ]
        text = text.replace(u'−', u'-').replace(u'‐', u'-')
        # Replace box-drawing horizontal lines as well [ プ─ -> プー ]
        text = text.replace(u'─', u'-')
        # Collapse runs of long-vowel marks into one [ プーーー -> プー ]
        text = re.sub(u'-+', u'-', text)
        # Remove single line breaks (two or more stay as a separator).
        text = self._pattern_lf.sub(r'\1\2', text)
        return text
def test(): | |
m = MyMeCab() | |
u_kaomoji = u'ひまなう(´・ω・`)' | |
u_kaomoji_norm = u'ひまなう( ́・ω・`)' | |
assert(m.normalize(u'~') == m.normalize(u'−') \ | |
== m.normalize(u'─') == m.normalize(u'-') \ | |
== m.normalize(u'─') == m.normalize(u'------')\ | |
== m.normalize(u'~~~~')) | |
assert(m.normalize(u'♥') == u'♥') | |
assert(m.normalize(u'ABC') == u'abc') | |
assert(m.normalize(u_kaomoji) == u_kaomoji_norm) | |
assert(m.normalize(u'AAA\r\nB\nC\n\nD\r\n\nE') == u'aaa\r\nbc\n\nd\r\n\ne') | |
assert([u'ひま', u'( ́・ω・`)'] == m.parse(u_kaomoji)) | |
assert([u'akb48'] == m.parse(u'AKB48')) | |
print 'test...OK' | |
if __name__ == '__main__':
    # Timing note kept from the original author (seconds for the same
    # workload):
    #   9.32  before the class refactor (MeCab() built on every split call)
    #   0.62  after the refactor -- roughly 15x faster
    '''
    9.3223669529 (class化以前、splitを呼ぶたびにMeCab()を呼んでいた)
    0.619592905045 (class化以降 x15 faster)
    '''
    test()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment