Created
May 22, 2013 08:32
-
-
Save shunsukeaihara/5626098 to your computer and use it in GitHub Desktop.
cabochaのラッパーのベース
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import CaboCha | |
import chardet | |
import syslog | |
import math | |
from collections import defaultdict | |
import unicodedata | |
# Configure syslog for this module: messages are tagged 'cabocha_wrapper',
# carry the process id (LOG_PID) and are also echoed to stderr (LOG_PERROR).
syslog.openlog(
    'cabocha_wrapper',
    syslog.LOG_PID | syslog.LOG_PERROR,
    syslog.LOG_SYSLOG,
)

# Decay constant of the position weight exp(-i / AVG_LENGTH) used by
# TextFeatureExtractor.to_weighted_dictionary.
AVG_LENGTH = 200.0
def is_japanese_string(string):
    """Placeholder that is not implemented yet.

    The argument is ignored and ``None`` is always returned.
    """
    return None
def to_utf8(string, encode=None):
    """Return *string* as NFKC-normalized, UTF-8 encoded bytes.

    Args:
        string: Byte string to convert.  Text that is already decoded
            (``unicode`` on Python 2) is accepted as well and is only
            normalized and re-encoded.
        encode: Name of the encoding of *string*.  When omitted or empty,
            the encoding is guessed with ``chardet``.

    Returns:
        The NFKC-normalized text encoded as UTF-8.

    Raises:
        UnicodeDecodeError: If *string* cannot be decoded with the given or
            detected encoding.
    """
    if isinstance(string, bytes):
        if not encode:
            # chardet reports an encoding of None when it cannot guess
            # (e.g. for empty input); decoding with None would raise a
            # TypeError, so fall back to UTF-8 in that case.
            encode = chardet.detect(string)["encoding"] or "utf-8"
        text = string.decode(encode)
    else:
        # Already decoded text: decoding it again would fail.
        text = string
    return unicodedata.normalize("NFKC", text).encode('utf-8')
def is_contentword(token):
    """Return the term that represents *token*, or None for non-content words.

    The part of speech is read from ``token.feature_list(0)``:

    * nouns ("名詞") are represented by their surface string;
    * verbs ("動詞"), adjectives ("形容詞") and adverbs ("副詞") are
      represented by ``token.feature_list(6)`` (presumably the base form --
      confirm against the dictionary's feature layout);
    * every other part of speech yields ``None``.

    Args:
        token: A parsed token exposing ``surface`` and ``feature_list(i)``.

    Returns:
        The representative string, or ``None``.
    """
    pos_base = token.feature_list(0)
    if pos_base == "名詞":
        # ``surface`` is a plain string attribute, not a method; calling it
        # (as the previous ``token.surface(6)`` did) raises TypeError.
        return token.surface
    if pos_base in ("動詞", "形容詞", "副詞"):
        return token.feature_list(6)
    return None
class Phrase(object):
    """A chunk of a parsed sentence together with its tokens.

    Besides the tokens, a phrase records the phrase it depends on (its head)
    and the ids of the phrases that depend on it.
    """

    def __init__(self, chunk, phrases, cid):
        """Store the chunk, the shared phrase list and this phrase's id.

        Args:
            chunk: Chunk object of the parser; only ``chunk.link`` is read.
            phrases: List holding every phrase of the sentence; it is indexed
                by ``chunk.link`` when dependencies are resolved.
            cid: Id of this phrase, reported to its head as a dependant.
        """
        self._chunk = chunk
        self._sentence = phrases
        self._cid = cid
        self._head = None
        self._tokens = []
        self._dependants = []

    def push_token(self, token):
        """Append *token* to this phrase."""
        self._tokens.append(token)

    def set_dependency(self):
        """Link this phrase to its head when ``chunk.link`` is positive."""
        head_index = self._chunk.link
        if not head_index > 0:
            # No head to attach to (presumably the root of the sentence).
            return
        head = self._sentence[head_index]
        self._head = head
        head.add_dependant(self._cid)

    def add_dependant(self, cid):
        """Record that the phrase with id *cid* depends on this phrase."""
        self._dependants.append(cid)

    def is_ne(self):
        """Return the ``ne`` tag of the first token whose tag is not 'O'.

        Returns ``None`` when every token is tagged 'O'.
        """
        return next((tok.ne for tok in self._tokens if tok.ne != 'O'), None)

    def is_num(self):
        """Return True if any token has "数" as ``feature_list(1)``."""
        return any(tok.feature_list(1) == "数" for tok in self._tokens)

    @property
    def surface(self):
        """Concatenated surface strings of all tokens."""
        pieces = [tok.surface for tok in self._tokens]
        return "".join(pieces)

    @property
    def tokens(self):
        """The list of tokens (the internal list itself, not a copy)."""
        return self._tokens

    @property
    def base_tokens(self):
        """``feature_list(6)`` of every token, in order."""
        bases = []
        for tok in self._tokens:
            bases.append(tok.feature_list(6))
        return bases
class CaboChaWrapper(object):
    """Wrapper around ``CaboCha.Parser`` that groups tokens into phrases."""

    def __init__(self, option):
        """Create the underlying parser.

        Args:
            option: Option string handed unchanged to ``CaboCha.Parser``.
        """
        self._cabo = CaboCha.Parser(option)

    def parse_sentence(self, sentence):
        """Parse one sentence and return its phrases in sentence order.

        Args:
            sentence: The sentence text handed to the parser.

        Returns:
            A list of ``Phrase`` objects with their dependencies resolved.
            The list is empty when the size of the parse tree cannot be
            read; that failure is reported through syslog.
        """
        tree = self._cabo.parse(sentence)
        try:
            size = tree.size()
        except Exception:
            # Best effort: a failed parse (presumably ``parse`` returned
            # None) yields no phrases instead of aborting the caller.
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            syslog.syslog(syslog.LOG_ALERT, 'cabocha failed! %s' % sentence)
            return []

        phrases = []
        current = None
        cid = 0
        # ``range`` behaves the same on Python 2 and 3 for these sizes.
        for index in range(size):
            token = tree.token(index)
            chunk = token.chunk
            if chunk:
                # A token that carries a chunk opens a new phrase; store the
                # phrase that was being filled before starting the next one.
                if current is not None:
                    phrases.append(current)
                current = Phrase(chunk, phrases, cid)
                cid += 1
            current.push_token(token)
        if current is not None:
            phrases.append(current)

        # Dependencies index into ``phrases``, so they can only be resolved
        # once every phrase of the sentence has been collected.
        for phrase in phrases:
            phrase.set_dependency()
        return phrases
class TextFeatureExtractor(object):
    """Extract content-word tokens from text and build term dictionaries."""

    def __init__(self, option=None):
        """Create the parser wrapper.

        Args:
            option: Option string for ``CaboChaWrapper``; ``None`` is
                replaced by the empty string.
        """
        if option is None:
            option = ""
        self._cabo = CaboChaWrapper(option)

    def to_dictionary(self, vec):
        """Count how often each term occurs in *vec*.

        Falsy entries (``None``, empty strings) are skipped.

        Returns:
            A ``defaultdict(int)`` mapping term to its frequency.
        """
        dic = defaultdict(int)
        for v in vec:
            if v:
                dic[v] += 1
        return dic

    def to_weighted_dictionary(self, vec):
        """Build a position-weighted term frequency dictionary.

        The entry at index ``i`` contributes ``exp(-i / AVG_LENGTH)``, so
        terms near the start of *vec* weigh more.  Falsy entries are skipped
        but still occupy their index.

        Returns:
            A ``defaultdict(float)`` keyed by the UTF-8 decoded term.
        """
        dic = defaultdict(float)
        for i, v in enumerate(vec):
            if v:
                dic[unicode(v, 'utf-8')] += math.exp(-i / AVG_LENGTH)
        return dic

    def remove_none(self, vec):
        """Return the truthy entries of *vec*, decoded from UTF-8."""
        return [unicode(token, 'utf-8') for token in vec if token]

    def parse(self, string):
        """Split *string* into sentences and return one entry per token.

        The text is converted with ``to_utf8``, split on '。' and each piece
        is parsed with '。' appended again.

        Args:
            string: The raw text to analyse.

        Returns:
            A list with one entry per token: the result of
            ``is_contentword`` for the token, or ``None`` for every token of
            a phrase that contains a numeral.  The ``None`` entries keep the
            list aligned with token positions; feed the list to
            ``to_dictionary``, ``to_weighted_dictionary`` or ``remove_none``
            to obtain the final representation.
        """
        tokens = []
        string = to_utf8(string)
        for sentence in string.split('。'):
            phrases = self._cabo.parse_sentence(sentence + "。")
            for phrase in phrases:
                if phrase.is_num():
                    # TODO: add numeral normalization later.  For now,
                    # phrases containing a numeral are skipped: each of
                    # their tokens is recorded as None.
                    tokens.extend([None] * len(phrase.tokens))
                else:
                    # TODO: decide later how named entities should be
                    # handled.
                    tokens.extend(
                        is_contentword(token) for token in phrase.tokens
                    )
        return tokens
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment