Created
June 8, 2013 06:04
-
-
Save ikegami-yukino/5734228 to your computer and use it in GitHub Desktop.
NAIST-JDicをIPA-Dicの文脈IDに変換
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding:utf-8
"""
Convert NAIST-JDic entries to IPA-Dic context IDs.

Per the original author's note, words with the following fine-grained
part-of-speech (conjugation) classes are ignored:
五段・タ行/五段・ナ行/五段・バ行/五段・ワ行ウ音便
"""
import sys
import codecs
import optparse
import collections

# A dictionary row must have at least fields 0..13 (surface form, left ID,
# right ID, cost, six part-of-speech columns, base form, reading,
# pronunciation and one extra column) to be converted.
_MIN_FIELDS = 14


def _read_id_table(path, encoding):
    """Read a ``left-id.def`` style file into ``(context_id, feature)`` pairs.

    Every useful line has the form ``<id><whitespace><feature string>``.
    Lines that do not contain both parts (for example blank lines) are
    skipped instead of triggering an ``IndexError`` later on.
    """
    pairs = []
    with codecs.open(path, "r", encoding) as id_file:
        for raw in id_file:
            parts = raw.strip().split(None, 1)
            if len(parts) == 2:
                pairs.append((parts[0], parts[1]))
    return pairs


def _build_common_ids(naist_ids, ipa_ids):
    """Map each NAIST context ID to the IPA entry with the identical feature.

    Returns an ordered mapping ``naist_id -> {'ipa', 'pos', 'write_pos'}``
    in NAIST file order.  ``write_pos`` is the feature string without its
    last column, which is what gets written to the output CSV.
    """
    ipa_by_feature = {}
    for ipa_id, feature in ipa_ids:
        # Keep the first IPA ID seen for a feature, like a first-match scan.
        ipa_by_feature.setdefault(feature, ipa_id)

    common_ids = collections.OrderedDict()
    for naist_id, feature in naist_ids:
        if feature in ipa_by_feature:
            common_ids[naist_id] = {
                'ipa': ipa_by_feature[feature],
                'pos': feature,
                'write_pos': ','.join(feature.split(',')[:-1]),
            }
    return common_ids


def _build_feature_index(common_ids):
    """Index the common entries by both full and truncated feature string.

    When several entries share a key, the earliest one (in NAIST file order)
    wins, which makes the fallback lookup deterministic.
    """
    index = {}
    for entry in common_ids.values():
        index.setdefault(entry['pos'], entry)
        index.setdefault(entry['write_pos'], entry)
    return index


def _convert_row(fields, common_ids, feature_index):
    """Return the IPA-Dic CSV line for one NAIST-JDic row, or ``None``.

    ``None`` means the row is too short or no matching context ID exists.
    """
    if len(fields) < _MIN_FIELDS:
        return None

    entry = common_ids.get(fields[1])
    if entry is None:
        # The row's own context ID has no IPA counterpart: fall back to
        # matching on the row's part-of-speech columns, from the most
        # specific spelling to the least specific one.
        base = ','.join(fields[4:10])
        candidates = (
            ','.join(fields[4:11]),
            base,
            base.replace(u'連用タ接続', u'連用形'),
        )
        for candidate in candidates:
            entry = feature_index.get(candidate)
            if entry is not None:
                break
    if entry is None:
        return None

    context_id = entry['ipa']
    etc2 = fields[14] if len(fields) > 14 else ''
    return u"%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (
        fields[0], context_id, context_id, fields[3], entry['write_pos'],
        fields[10], fields[11], fields[12], fields[13], etc2)


def main(argv=None):
    """Run the conversion.

    ``argv`` is the list of command-line arguments without the program name;
    ``None`` means ``sys.argv[1:]``.  Converted rows go to the ``-o`` file,
    rows that cannot be converted go to the ``--err`` file (``error.csv`` by
    default).  Returns 0 on success; exits with a usage error when a
    required option is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option("--naist", dest="naist_dic_path", help="NAIST-JDic path")
    parser.add_option("--ipa", dest="ipa_dic_path", help="IPA-Dic path")
    parser.add_option("-e", dest="encode", help="MeCab-Dic char code", default="eucjp")
    parser.add_option("-o", dest="out_file", help="output file")
    parser.add_option("--err", dest="err_file", default="error.csv",
                      help="file receiving rows that could not be converted")
    (options, args) = parser.parse_args(argv)
    if not (options.naist_dic_path and options.ipa_dic_path and options.out_file):
        parser.error("--naist, --ipa and -o are required")

    # Find the context IDs shared by both dictionaries.
    naist_ids = _read_id_table(options.naist_dic_path + '/left-id.def', options.encode)
    ipa_ids = _read_id_table(options.ipa_dic_path + '/left-id.def', options.encode)
    common_ids = _build_common_ids(naist_ids, ipa_ids)
    feature_index = _build_feature_index(common_ids)

    # Write the words that have a shared context ID to the output CSV.
    # The source is opened first so a missing input does not truncate outputs.
    with codecs.open(options.naist_dic_path + '/naist-jdic.csv', 'r', options.encode) as src:
        with codecs.open(options.out_file, "w", options.encode) as out_file:
            with codecs.open(options.err_file, "w", options.encode) as errout_file:
                for raw in src:
                    line = raw.rstrip()
                    if not line:
                        continue
                    converted = _convert_row(line.split(','), common_ids, feature_index)
                    if converted is None:
                        errout_file.write('%s\n' % line)
                    else:
                        out_file.write(converted)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment