e-mon · June 28, 2015 19:26
diff --git a/simple.py b/simple.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 import sys
 from math import log
 from collections import Counter,defaultdict
 from functools import reduce
 from itertools import chain
 import Help

 class SIMPLE:
    """Kana-kanji converter driven by a word-unigram model.

    The training corpus holds one sentence per line, written as
    whitespace-separated ``word/kkci`` pairs (``kkci`` is the input symbol
    string of the word).  Pair frequencies are counted, pairs seen only once
    are folded into the unknown-word symbol ``UT``, and input strings are
    converted with a Viterbi search over the resulting table.
    """

    #-------------------------------------------------------------------------------------
    #                        set variables
    #-------------------------------------------------------------------------------------
    def __init__(self, learning_corpus):
        """Build the frequency model and the conversion dictionary.

        learning_corpus -- path of the training corpus file.
        """
        self.UTMAXLEN = 4                       # longest unknown word, in input symbols
        self.KKCInput = self.Input()
        self.BT, self.UT = ("BT", "UT")         # sentence-boundary / unknown-word symbols
        flatten_KKCInput = (list(chain.from_iterable(self.KKCInput[:3]))
                            + list(chain.from_iterable(self.KKCInput[3])))
        # Cost of emitting one symbol of an unknown word: uniform over the
        # input symbols plus one extra outcome for the word-final BT.
        self.CharLogP = log(1 + len(flatten_KKCInput))

        self.LC = learning_corpus               # corpus file path

        PairFreq = self.generate_model()
        self.PairFreq, self.Freq = self.smoothing(PairFreq)
        self.Dict = self.create_dictionary(PairFreq)

    #-------------------------------------------------------------------------------------
    #                        build the language model (pair frequencies)
    #-------------------------------------------------------------------------------------
    def generate_model(self):
        """Count every token of the corpus, plus one BT per line.

        Returns a Counter mapping token -> frequency (empty for an empty
        corpus).  If the corpus cannot be opened, a message is printed and
        the interpreter exits.
        """
        PairFreq = Counter()
        try:
            # The context manager closes the corpus even if reading fails.
            with open(self.LC, 'r') as corpus:
                for line in corpus:
                    PairFreq.update(line.split())
                    PairFreq[self.BT] += 1      # one sentence-boundary symbol per line
        except IOError:
            print('"%s" cannot be opened.' % self.LC)
            quit()
        return PairFreq

    #-------------------------------------------------------------------------------------
    #                        smoothing
    #-------------------------------------------------------------------------------------
    def smoothing(self, PairFreq):
        """Fold pairs seen exactly once into the unknown-word symbol.

        PairFreq is modified in place and must supply 0 for missing keys
        (e.g. a Counter).  The special symbols BT and UT are never folded:
        a one-line corpus would otherwise lose BT and make the end-of-sentence
        transition impossible.

        Returns (PairFreq, Freq) where Freq is the sum of all frequencies
        before folding.
        """
        Freq = 0                                # f() = sum of f(word/kkci)
        special = (self.BT, self.UT)
        for pair in list(PairFreq):             # snapshot: the table changes below
            freq = PairFreq[pair]
            Freq += freq
            if freq == 1 and pair not in special:
                PairFreq[self.UT] += freq       # add to f(UT)
                del PairFreq[pair]              # and drop f(pair)
        return PairFreq, Freq

    #-------------------------------------------------------------------------------------
    #                        build the kana-kanji conversion dictionary
    #-------------------------------------------------------------------------------------
    def create_dictionary(self, PairFreq):
        """Index the known pairs by their input symbol string.

        Returns a defaultdict(list) mapping kkci -> list of ``word/kkci``
        pairs; the special symbols BT and UT are left out.
        """
        Dict = defaultdict(list)                # KKCI => <Word, KKCI>+
        special = (self.BT, self.UT)
        for pair in PairFreq:
            if pair in special:                 # special symbols are not dictionary entries
                continue
            kkci = pair.split('/')[1]           # input-symbol part of the pair
            Dict[kkci].append(pair)
        return Dict

    #-------------------------------------------------------------------------------------
    #                        main
    #-------------------------------------------------------------------------------------
    def convert(self, inputs):
        """Convert one input symbol string; returns the list of chosen pairs."""
        # Use the argument itself rather than a module-level variable.
        return self.KKConv(inputs, self.PairFreq, self.Freq, self.Dict)

    #-------------------------------------------------------------------------------------
    #                        input symbol set
    #-------------------------------------------------------------------------------------
    def Input(self):
        """Return the input symbol tables (LATINU, NUMBER, HIRAGANA, OTHERS).

        The first three are flat lists of single symbols; OTHERS is a list
        of four such lists.
        """
        LATINU   = "Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ".split()
        NUMBER   = "０ １ ２ ３ ４ ５ ６ ７ ８ ９".split()
        HIRAGANA = ("ぁ あ ぃ い ぅ う ぇ え ぉ お か が き ぎ く"
                    " ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た"
                    " だ ち ぢ っ つ づ て で と ど な に ぬ ね の は"
                    " ば ぱ ひ び ぴ ふ ぶ ぷ へ べ ぺ ほ ぼ ぽ ま み"
                    " む め も ゃ や ゅ ゆ ょ よ ら り る れ ろ ゎ わ"
                    " ゐ ゑ を ん").split()
        OTHERS   = [" ヴ ヵ ヶ ".split(),                     # katakana-only characters
                    "ー ＝ ￥ ｀ 「 」 ； ’ 、 。".split(), # ／ => ・ (if US101)
                    "！ ＠ ＃ ＄ ％ ＾ ＆ ＊ （ ） ＿ ＋ ｜ 〜 ｛ ｝ ： ” ＜ ＞ ？".split(),
                    "・".split()]                             # for JP106 keyboard
        return LATINU, NUMBER, HIRAGANA, OTHERS

    #-------------------------------------------------------------------------------------
    #                       KKConv
    #-------------------------------------------------------------------------------------
    @staticmethod
    def _cost(freq, total):
        """Return -log(freq / total), or infinity for an event never observed."""
        if freq <= 0 or total <= 0:
            return float('inf')
        return -log(freq / total)

    @staticmethod
    def _cheapest(candidates):
        """Return the candidate node with the smallest cost, or None if there is none.

        On ties the earliest candidate wins.
        """
        return min(candidates, key=lambda cand: cand[2], default=None)

    def KKConv(self, sent, PairFreq, Freq, Dict):
        """Viterbi search for the cheapest segmentation of ``sent``.

        sent     -- input symbol string.
        PairFreq -- pair -> frequency table that yields 0 for unseen keys.
        Freq     -- total frequency used as the denominator.
        Dict     -- mapping kkci -> list of known pairs (only read).

        A node is the tuple (previous node, pair, accumulated cost) where the
        cost is a negative log probability.  Unknown words are emitted as
        ``kkci/UT``.  Returns the list of pairs from left to right.
        """
        POSI = len(sent)                                # right-most analysis position
        VTable = [[] for _ in range(POSI + 1)]          # Viterbi table
        VTable[0].append((None, self.BT, 0))            # left edge of the lattice

        for posi in range(1, POSI + 1):                 # right end of the lookup
            for _from in range(posi):                   # left end of the lookup
                kkci = sent[_from:posi]
                prev_nodes = VTable[_from]

                # Known words.  .get() keeps a defaultdict from growing on misses.
                for pair in Dict.get(kkci, ()):
                    cost = self._cost(PairFreq[pair], Freq)
                    best = self._cheapest(
                        (node, pair, node[2] + cost) for node in prev_nodes)
                    if best is not None:
                        VTable[posi].append(best)

                # Unknown word spanning the same substring.
                if posi - _from <= self.UTMAXLEN:
                    cost = self._cost(PairFreq[self.UT], Freq)
                    # every input symbol plus the word-final BT
                    spelling = (posi - _from + 1) * self.CharLogP
                    pair = kkci + '/' + self.UT
                    best = self._cheapest(
                        (node, pair, node[2] + cost + spelling) for node in prev_nodes)
                    if best is not None:
                        VTable[posi].append(best)

        # Transition to the closing BT.
        cost = self._cost(PairFreq[self.BT], Freq)
        best = self._cheapest(
            (node, self.BT, node[2] + cost) for node in VTable[POSI])
        if best is None:
            return []

        # Walk the back-pointers, then restore left-to-right order.
        result = []                                     # <word, kkci>+
        node = best[0]                                  # right-most real node
        while node[0] is not None:
            result.append(node[1])
            node = node[0]
        result.reverse()
        return result

 if __name__ == '__main__':
    #-------------------------------------------------------------------------------------
    #                        check arguments
    #-------------------------------------------------------------------------------------
    # Exactly one argument (the corpus path) is accepted; "-help" or any
    # other argument count calls Help.Help (presumably a usage message) and stops.
    usage_requested = not (len(sys.argv) == 2 and sys.argv[1] != "-help")
    if usage_requested:
        Help.Help('./kkc-word-1.perl')
        quit()

    LC = sys.argv[1]

    simple = SIMPLE(LC)

    # Convert standard input line by line and print the chosen pairs
    # separated by single spaces.
    inputs = sys.stdin.read().splitlines()
    for sent in inputs:
        result = simple.convert(sent.rstrip())
        print(' '.join(result))
	#!/usr/bin/env python
	# -- coding: utf-8 --

	import sys
	from math import log
	from collections import Counter,defaultdict
	from functools import reduce
	from itertools import chain
	import Help

	class SIMPLE:
	    """Kana-kanji converter driven by a word-unigram model.

	    The training corpus holds one sentence per line, written as
	    whitespace-separated ``word/kkci`` pairs (``kkci`` is the input symbol
	    string of the word).  Pair frequencies are counted, pairs seen only once
	    are folded into the unknown-word symbol ``UT``, and input strings are
	    converted with a Viterbi search over the resulting table.
	    """

	    #-------------------------------------------------------------------------------------
	    # set variables
	    #-------------------------------------------------------------------------------------
	    def __init__(self, learning_corpus):
	        """Build the frequency model and the conversion dictionary.

	        learning_corpus -- path of the training corpus file.
	        """
	        self.UTMAXLEN = 4                       # longest unknown word, in input symbols
	        self.KKCInput = self.Input()
	        self.BT, self.UT = ("BT", "UT")         # sentence-boundary / unknown-word symbols
	        flatten_KKCInput = (list(chain.from_iterable(self.KKCInput[:3]))
	                            + list(chain.from_iterable(self.KKCInput[3])))
	        # Cost of emitting one symbol of an unknown word: uniform over the
	        # input symbols plus one extra outcome for the word-final BT.
	        self.CharLogP = log(1 + len(flatten_KKCInput))

	        self.LC = learning_corpus               # corpus file path

	        PairFreq = self.generate_model()
	        self.PairFreq, self.Freq = self.smoothing(PairFreq)
	        self.Dict = self.create_dictionary(PairFreq)

	    #-------------------------------------------------------------------------------------
	    # build the language model (pair frequencies)
	    #-------------------------------------------------------------------------------------
	    def generate_model(self):
	        """Count every token of the corpus, plus one BT per line.

	        Returns a Counter mapping token -> frequency (empty for an empty
	        corpus).  If the corpus cannot be opened, a message is printed and
	        the interpreter exits.
	        """
	        PairFreq = Counter()
	        try:
	            # The context manager closes the corpus even if reading fails.
	            with open(self.LC, 'r') as corpus:
	                for line in corpus:
	                    PairFreq.update(line.split())
	                    PairFreq[self.BT] += 1      # one sentence-boundary symbol per line
	        except IOError:
	            print('"%s" cannot be opened.' % self.LC)
	            quit()
	        return PairFreq

	    #-------------------------------------------------------------------------------------
	    # smoothing
	    #-------------------------------------------------------------------------------------
	    def smoothing(self, PairFreq):
	        """Fold pairs seen exactly once into the unknown-word symbol.

	        PairFreq is modified in place and must supply 0 for missing keys
	        (e.g. a Counter).  The special symbols BT and UT are never folded:
	        a one-line corpus would otherwise lose BT and make the end-of-sentence
	        transition impossible.

	        Returns (PairFreq, Freq) where Freq is the sum of all frequencies
	        before folding.
	        """
	        Freq = 0                                # f() = sum of f(word/kkci)
	        special = (self.BT, self.UT)
	        for pair in list(PairFreq):             # snapshot: the table changes below
	            freq = PairFreq[pair]
	            Freq += freq
	            if freq == 1 and pair not in special:
	                PairFreq[self.UT] += freq       # add to f(UT)
	                del PairFreq[pair]              # and drop f(pair)
	        return PairFreq, Freq

	    #-------------------------------------------------------------------------------------
	    # build the kana-kanji conversion dictionary
	    #-------------------------------------------------------------------------------------
	    def create_dictionary(self, PairFreq):
	        """Index the known pairs by their input symbol string.

	        Returns a defaultdict(list) mapping kkci -> list of ``word/kkci``
	        pairs; the special symbols BT and UT are left out.
	        """
	        Dict = defaultdict(list)                # KKCI => <Word, KKCI>+
	        special = (self.BT, self.UT)
	        for pair in PairFreq:
	            if pair in special:                 # special symbols are not dictionary entries
	                continue
	            kkci = pair.split('/')[1]           # input-symbol part of the pair
	            Dict[kkci].append(pair)
	        return Dict

	    #-------------------------------------------------------------------------------------
	    # main
	    #-------------------------------------------------------------------------------------
	    def convert(self, inputs):
	        """Convert one input symbol string; returns the list of chosen pairs."""
	        # Use the argument itself rather than a module-level variable.
	        return self.KKConv(inputs, self.PairFreq, self.Freq, self.Dict)

	    #-------------------------------------------------------------------------------------
	    # input symbol set
	    #-------------------------------------------------------------------------------------
	    def Input(self):
	        """Return the input symbol tables (LATINU, NUMBER, HIRAGANA, OTHERS).

	        The first three are flat lists of single symbols; OTHERS is a list
	        of four such lists.  The symbols must stay space-separated so that
	        split() yields one entry per symbol.
	        """
	        LATINU   = "Ａ Ｂ Ｃ Ｄ Ｅ Ｆ Ｇ Ｈ Ｉ Ｊ Ｋ Ｌ Ｍ Ｎ Ｏ Ｐ Ｑ Ｒ Ｓ Ｔ Ｕ Ｖ Ｗ Ｘ Ｙ Ｚ".split()
	        NUMBER   = "０ １ ２ ３ ４ ５ ６ ７ ８ ９".split()
	        HIRAGANA = ("ぁ あ ぃ い ぅ う ぇ え ぉ お か が き ぎ く"
	                    " ぐ け げ こ ご さ ざ し じ す ず せ ぜ そ ぞ た"
	                    " だ ち ぢ っ つ づ て で と ど な に ぬ ね の は"
	                    " ば ぱ ひ び ぴ ふ ぶ ぷ へ べ ぺ ほ ぼ ぽ ま み"
	                    " む め も ゃ や ゅ ゆ ょ よ ら り る れ ろ ゎ わ"
	                    " ゐ ゑ を ん").split()
	        OTHERS   = [" ヴ ヵ ヶ ".split(),                     # katakana-only characters
	                    "ー ＝ ￥ ｀ 「 」 ； ’ 、 。".split(), # ／ => ・ (if US101)
	                    "！ ＠ ＃ ＄ ％ ＾ ＆ ＊ （ ） ＿ ＋ ｜ 〜 ｛ ｝ ： ” ＜ ＞ ？".split(),
	                    "・".split()]                             # for JP106 keyboard
	        return LATINU, NUMBER, HIRAGANA, OTHERS

	    #-------------------------------------------------------------------------------------
	    # KKConv
	    #-------------------------------------------------------------------------------------
	    @staticmethod
	    def _cost(freq, total):
	        """Return -log(freq / total), or infinity for an event never observed."""
	        if freq <= 0 or total <= 0:
	            return float('inf')
	        return -log(freq / total)

	    @staticmethod
	    def _cheapest(candidates):
	        """Return the candidate node with the smallest cost, or None if there is none.

	        On ties the earliest candidate wins.
	        """
	        return min(candidates, key=lambda cand: cand[2], default=None)

	    def KKConv(self, sent, PairFreq, Freq, Dict):
	        """Viterbi search for the cheapest segmentation of ``sent``.

	        sent     -- input symbol string.
	        PairFreq -- pair -> frequency table that yields 0 for unseen keys.
	        Freq     -- total frequency used as the denominator.
	        Dict     -- mapping kkci -> list of known pairs (only read).

	        A node is the tuple (previous node, pair, accumulated cost) where the
	        cost is a negative log probability.  Unknown words are emitted as
	        ``kkci/UT``.  Returns the list of pairs from left to right.
	        """
	        POSI = len(sent)                                # right-most analysis position
	        VTable = [[] for _ in range(POSI + 1)]          # Viterbi table
	        VTable[0].append((None, self.BT, 0))            # left edge of the lattice

	        for posi in range(1, POSI + 1):                 # right end of the lookup
	            for _from in range(posi):                   # left end of the lookup
	                kkci = sent[_from:posi]
	                prev_nodes = VTable[_from]

	                # Known words.  .get() keeps a defaultdict from growing on misses.
	                for pair in Dict.get(kkci, ()):
	                    cost = self._cost(PairFreq[pair], Freq)
	                    best = self._cheapest(
	                        (node, pair, node[2] + cost) for node in prev_nodes)
	                    if best is not None:
	                        VTable[posi].append(best)

	                # Unknown word spanning the same substring.
	                if posi - _from <= self.UTMAXLEN:
	                    cost = self._cost(PairFreq[self.UT], Freq)
	                    # every input symbol plus the word-final BT
	                    spelling = (posi - _from + 1) * self.CharLogP
	                    pair = kkci + '/' + self.UT
	                    best = self._cheapest(
	                        (node, pair, node[2] + cost + spelling) for node in prev_nodes)
	                    if best is not None:
	                        VTable[posi].append(best)

	        # Transition to the closing BT.
	        cost = self._cost(PairFreq[self.BT], Freq)
	        best = self._cheapest(
	            (node, self.BT, node[2] + cost) for node in VTable[POSI])
	        if best is None:
	            return []

	        # Walk the back-pointers, then restore left-to-right order.
	        result = []                                     # <word, kkci>+
	        node = best[0]                                  # right-most real node
	        while node[0] is not None:
	            result.append(node[1])
	            node = node[0]
	        result.reverse()
	        return result

	if __name__ == '__main__':
	    #-------------------------------------------------------------------------------------
	    # check arguments
	    #-------------------------------------------------------------------------------------
	    # Exactly one argument (the corpus path) is accepted; "-help" or any
	    # other argument count calls Help.Help (presumably a usage message) and stops.
	    if len(sys.argv) != 2 or sys.argv[1] == "-help":
	        Help.Help('./kkc-word-1.perl')
	        quit()

	    LC = sys.argv[1]

	    simple = SIMPLE(LC)

	    # Convert standard input line by line and print the chosen pairs
	    # separated by single spaces.
	    inputs = sys.stdin.read().splitlines()
	    for sent in inputs:
	        result = simple.convert(sent.rstrip())
	        print(' '.join(result))
No results found