Created
June 28, 2015 19:26
-
-
Save e-mon/73d53835abec0d22e51e to your computer and use it in GitHub Desktop.
SIMPLE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
from math import log | |
from collections import Counter,defaultdict | |
from functools import reduce | |
from itertools import chain | |
import Help | |
class SIMPLE:
    """Word-unigram kana-kanji converter trained from a segmented corpus.

    Every corpus line is a whitespace-separated sequence of ``word/kkci``
    pairs, where ``kkci`` is the input (reading) string of the word.  Pair
    frequencies are counted, pairs seen only once are folded into the
    unknown-word symbol ``UT``, and conversion searches for the pair sequence
    with the smallest accumulated negative log probability.
    """

    # -------------------------------------------------------------------------
    # set variables
    # -------------------------------------------------------------------------
    def __init__(self, learning_corpus):
        """Train the model from the corpus file at *learning_corpus*.

        Raises:
            SystemExit: if the corpus file cannot be opened or read.
        """
        self.UTMAXLEN = 4                    # longest span tried as an unknown word
        self.KKCInput = self.Input()
        self.BT, self.UT = ("BT", "UT")      # sentence-boundary / unknown-word symbols
        symbols = list(chain.from_iterable(self.KKCInput[:3]))
        symbols += chain.from_iterable(self.KKCInput[3])
        # Cost of generating one character of an unknown word: uniform over
        # the input symbol set plus one extra outcome (the word terminator).
        self.CharLogP = log(1 + len(symbols))
        self.LC = learning_corpus            # corpus file path
        self.PairFreq, self.Freq = self.smoothing(self.generate_model())
        self.Dict = self.create_dictionary(self.PairFreq)

    # -------------------------------------------------------------------------
    # build the language model (pair frequencies)
    # -------------------------------------------------------------------------
    def generate_model(self):
        """Count pair frequencies in the corpus file ``self.LC``.

        Each line contributes its whitespace-separated tokens plus one
        occurrence of the sentence-end symbol ``self.BT``.

        Returns:
            Counter mapping token -> frequency (empty for an empty corpus).

        Raises:
            SystemExit: if the file cannot be opened or read.
        """
        PairFreq = Counter()
        try:
            with open(self.LC, 'r') as corpus:
                for line in corpus:
                    PairFreq.update(line.split())
                    PairFreq[self.BT] += 1   # one sentence-end symbol per line
        except IOError:
            print('"%s" cannot be opened.' % self.LC)
            sys.exit(1)
        return PairFreq

    # -------------------------------------------------------------------------
    # smoothing
    # -------------------------------------------------------------------------
    def smoothing(self, PairFreq):
        """Fold every pair seen exactly once into the unknown-word count.

        *PairFreq* is modified in place.  The special symbols ``BT`` and
        ``UT`` are never folded: removing ``BT`` (e.g. for a one-line corpus)
        would leave the sentence-end probability at zero.

        Returns:
            (PairFreq, Freq) where Freq is the total of all frequencies,
            f() = sum of f(word/kkci).
        """
        special = (self.BT, self.UT)
        Freq = 0
        for pair in list(PairFreq):          # snapshot: entries are deleted below
            freq = PairFreq[pair]
            Freq += freq
            if freq == 1 and pair not in special:
                PairFreq[self.UT] = PairFreq.get(self.UT, 0) + freq
                del PairFreq[pair]
        return PairFreq, Freq

    # -------------------------------------------------------------------------
    # build the kana-kanji conversion dictionary
    # -------------------------------------------------------------------------
    def create_dictionary(self, PairFreq):
        """Index the pairs of *PairFreq* by their input string.

        Returns:
            defaultdict(list) mapping kkci -> list of ``word/kkci`` pairs.
        """
        Dict = defaultdict(list)             # KKCI => <Word, KKCI>+
        for pair in PairFreq.keys():
            if pair in [self.BT, self.UT]:   # special symbols are not dictionary entries
                continue
            kkci = pair.split('/')[1]        # input-string part of the pair
            Dict[kkci].append(pair)
        return Dict

    # -------------------------------------------------------------------------
    # main
    # -------------------------------------------------------------------------
    def convert(self, inputs):
        """Convert the input string *inputs* and return the list of pairs."""
        return self.KKConv(inputs, self.PairFreq, self.Freq, self.Dict)

    # -------------------------------------------------------------------------
    # input symbol set
    # -------------------------------------------------------------------------
    def Input(self):
        """Return the input symbol set as (LATINU, NUMBER, HIRAGANA, OTHERS).

        The first three are flat lists of symbols; OTHERS is a list of lists.
        """
        LATINU = "A B C D E F G H I J K L M N O P Q R S T U V W X Y Z".split()
        NUMBER = "0 1 2 3 4 5 6 7 8 9".split()
        HIRAGANA = ("ぁ あ ぃ い ぅ う ぇ え ぉ お か が き ぎ く"
                    " ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た"
                    " だ ち ぢ っ つ づ て で と ど な に ぬ ね の は"
                    " ば ぱ ひ び ぴ ふ ぶ ぷ へ べ ぺ ほ ぼ ぽ ま み"
                    " む め も ゃ や ゅ ゆ ょ よ ら り る れ ろ ゎ わ"
                    " ゐ ゑ を ん").split()
        OTHERS = [" ヴ ヵ ヶ ".split(),                      # katakana-only characters
                  "ー = ¥ ` 「 」 ; ’ 、 。".split(),         # / => ・ (if US101)
                  "! @ # $ % ^ & * ( ) _ + | 〜 { } : ” < > ?".split(),
                  "・".split()]                               # for JP106 keyboard
        return LATINU, NUMBER, HIRAGANA, OTHERS

    # -------------------------------------------------------------------------
    # KKConv
    # -------------------------------------------------------------------------
    def KKConv(self, sent, PairFreq, Freq, Dict):
        """Convert *sent* with a Viterbi search over dictionary and unknown words.

        A node is a tuple ``(prev, pair, logP)``: the predecessor node, the
        pair ending at this position, and the accumulated negative log
        probability (smaller is better).

        Args:
            sent: input string to convert.
            PairFreq: pair -> frequency mapping (including BT and UT).
            Freq: total frequency used as the denominator.
            Dict: kkci -> list of pairs.

        Returns:
            List of pairs covering *sent* left to right; spans with no
            dictionary entry appear as ``"<kkci>/UT"``.

        Raises:
            ValueError: from ``log`` if the frequency of ``UT`` or ``BT`` in
                *PairFreq* is zero.
        """
        POSI = len(sent)                             # rightmost analysis position
        VTable = [[] for _ in range(POSI + 1)]       # Viterbi table
        VTable[0].append((None, self.BT, 0))         # left edge of the DP

        for posi in range(1, POSI + 1):              # right end of the lookup span
            for _from in range(posi):                # left end of the lookup span
                kkci = sent[_from:posi]
                # .get() instead of indexing: indexing a defaultdict would
                # insert an empty entry for every substring looked up.
                for pair in Dict.get(kkci, ()):      # known words
                    best = (None, None, 0)
                    for node in VTable[_from]:
                        logP = node[2] - log(PairFreq[pair] / Freq)
                        if (best[1] is None) or (logP < best[2]):
                            best = (node, pair, logP)
                    if best[1] is not None:
                        VTable[posi].append(best)
                if posi - _from <= self.UTMAXLEN:    # unknown-word node
                    best = (None, None, 0)
                    for node in VTable[_from]:
                        # UT itself, then each input symbol plus the word terminator
                        logP = (node[2] - log(PairFreq[self.UT] / Freq)
                                + (posi - _from + 1) * self.CharLogP)
                        if (best[1] is None) or (logP < best[2]):
                            pair = kkci + '/' + self.UT
                            best = (node, pair, logP)
                    if best[1] is not None:
                        VTable[posi].append(best)

        best = (None, None, 0)
        for node in VTable[POSI]:                    # transition to the final BT
            logP = node[2] - log(PairFreq[self.BT] / Freq)
            if (best[1] is None) or (logP < best[2]):
                best = (node, self.BT, logP)

        # Walk the back-pointers from the right edge and restore left-to-right order.
        result = []                                  # <word, kkci>+
        node = best[0]
        while node[0] is not None:
            result.append(node[1])
            node = node[0]
        result.reverse()
        return result
if __name__ == '__main__':
    # -------------------------------------------------------------------------
    # check arguments
    # -------------------------------------------------------------------------
    # Exactly one argument (the corpus path) is expected; anything else, or
    # "-help", shows the help text and exits.
    if len(sys.argv) != 2 or sys.argv[1] == "-help":
        Help.Help('./kkc-word-1.perl')
        sys.exit()          # sys.exit, not the interactive-only quit() helper

    LC = sys.argv[1]
    simple = SIMPLE(LC)

    # Read all of stdin, then convert it line by line.
    # NOTE(review): `sent` is deliberately kept as a module-level name; verify
    # that nothing else in this file resolves `sent` as a global before renaming.
    inputs = sys.stdin.read().splitlines()
    for sent in inputs:
        result = simple.convert(sent.rstrip())
        print(' '.join(result))
        # Alternative output (word part of each pair only):
        # print(''.join(map(lambda x:x.split('/')[0],result)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment