Skip to content

Instantly share code, notes, and snippets.

@ikegami-yukino
Created June 8, 2013 06:04
Show Gist options
  • Save ikegami-yukino/5734228 to your computer and use it in GitHub Desktop.
Save ikegami-yukino/5734228 to your computer and use it in GitHub Desktop.
NAIST-JDicをIPA-Dicの文脈IDに変換
#coding:utf-8
"""
NAIST-JDicをIPA-Dicの文脈IDに変換
以下の品詞細分類の単語は無視される
五段・タ行/五段・ナ行/五段・バ行/五段・ワ行ウ音便
"""
import sys, codecs, optparse
def read_id_def(path, encoding):
    """Read a ``left-id.def`` file into a list of space-split rows.

    Every line is stripped and split on single spaces.  The rest of the
    script treats element 0 of a row as the context ID and element 1 as the
    comma-separated part-of-speech feature string.
    """
    with codecs.open(path, "r", encoding) as id_file:
        return [l.strip().split(' ') for l in id_file]


def build_common_ids(naist_ids, ipa_ids):
    """Map NAIST-JDic context IDs to the IPA-Dic IDs with identical features.

    Args:
        naist_ids: rows from the NAIST-JDic ``left-id.def`` (see
            ``read_id_def``).
        ipa_ids: rows from the IPA-Dic ``left-id.def``.

    Returns:
        A dict keyed by NAIST context ID.  Each value is a dict with
        ``'ipa'`` (the IPA-Dic context ID), ``'pos'`` (the full feature
        string) and ``'write_pos'`` (the feature string without its last
        field, which is what gets written to the output CSV).

    Rows with fewer than two elements (e.g. blank lines) are skipped.
    """
    # Index the IPA side once; setdefault keeps the first ID seen for a
    # feature string, i.e. the same winner a top-to-bottom scan would pick.
    ipa_by_pos = {}
    for row in ipa_ids:
        if len(row) >= 2:
            ipa_by_pos.setdefault(row[1], row[0])

    common_ids = {}
    for row in naist_ids:
        if len(row) < 2:
            continue
        features = row[1]
        if features in ipa_by_pos:
            common_ids[row[0]] = {
                'ipa': ipa_by_pos[features],
                'pos': features,
                'write_pos': ','.join(features.split(',')[:-1]),
            }
    return common_ids


def build_pos_index(common_ids):
    """Index the entries of ``common_ids`` by feature string.

    Both the full ``'pos'`` string and the shortened ``'write_pos'`` string
    of every entry become keys.  Entries are visited in ascending numeric
    NAIST ID order and the first entry to claim a key keeps it, so ties are
    resolved the same way on every run and every Python version.
    """
    index = {}
    for naist_id in sorted(common_ids, key=int):
        entry = common_ids[naist_id]
        index.setdefault(entry['pos'], entry)
        index.setdefault(entry['write_pos'], entry)
    return index


def convert_line(line, common_ids, pos_index=None):
    """Convert one NAIST-JDic CSV line to an IPA-Dic style CSV row.

    Args:
        line: one dictionary line without its trailing newline.
        common_ids: mapping produced by ``build_common_ids``.
        pos_index: optional mapping produced by ``build_pos_index``; it is
            built on demand when omitted (pass it in when converting many
            lines).

    Returns:
        The output row (terminated by a newline), or ``None`` when the line
        has fewer than 14 comma-separated fields or no matching context ID
        can be found.
    """
    fields = line.split(',')
    if len(fields) < 14:
        # Too short to carry every column that is written out below.
        return None

    entry = common_ids.get(fields[1])
    if entry is None:
        # The context ID itself is unknown: fall back to matching on the
        # feature columns, from most to least specific.
        if pos_index is None:
            pos_index = build_pos_index(common_ids)
        short_pos = ','.join(fields[4:10])
        candidates = (
            ','.join(fields[4:11]),
            short_pos,
            short_pos.replace(u'連用タ接続', u'連用形'),
        )
        for candidate in candidates:
            entry = pos_index.get(candidate)
            if entry is not None:
                break
    if entry is None:
        return None

    word = fields[0]
    cost = fields[3]
    stem = fields[10]
    yomi = fields[11]
    pron = fields[12]
    etc = fields[13]
    etc2 = fields[14] if len(fields) > 14 else ''
    context_id = entry['ipa']
    # The same IPA-Dic ID is written into both ID columns.
    return u"%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" % (
        word, context_id, context_id, cost, entry['write_pos'],
        stem, yomi, pron, etc, etc2)


def main():
    """Command-line entry point.

    Reads ``left-id.def`` from both dictionary directories, converts every
    line of ``naist-jdic.csv`` and writes the converted rows to the ``-o``
    file.  Lines that cannot be converted are copied to ``error.csv`` in the
    current directory.
    """
    parser = optparse.OptionParser()
    parser.add_option("--naist", dest="naist_dic_path", help="NAIST-JDic path")
    parser.add_option("--ipa", dest="ipa_dic_path", help="IPA-Dic path")
    parser.add_option("-e", dest="encode", help="MeCab-Dic char code", default="eucjp")
    parser.add_option("-o", dest="out_file", help="output file")
    (options, args) = parser.parse_args()
    if not (options.naist_dic_path and options.ipa_dic_path and options.out_file):
        # Fail with a usage message instead of a TypeError further down.
        parser.error("--naist, --ipa and -o are all required")

    # Find the context IDs whose feature strings exist in both dictionaries.
    naist_ids = read_id_def(options.naist_dic_path + '/left-id.def', options.encode)
    ipa_ids = read_id_def(options.ipa_dic_path + '/left-id.def', options.encode)
    common_ids = build_common_ids(naist_ids, ipa_ids)
    pos_index = build_pos_index(common_ids)

    # Write the words that have a shared context ID to the output CSV.
    csv_path = options.naist_dic_path + '/naist-jdic.csv'
    with codecs.open(options.out_file, "w", options.encode) as out_file:
        with codecs.open('error.csv', "w", options.encode) as errout_file:
            with codecs.open(csv_path, 'r', options.encode) as dic_file:
                for raw_line in dic_file:
                    line = raw_line.rstrip()
                    row = convert_line(line, common_ids, pos_index)
                    if row is None:
                        errout_file.write('%s\n' % line)
                    else:
                        out_file.write(row)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment