Last active
January 24, 2017 05:11
-
-
Save ikegami-yukino/40a10a0e9611450b0c2a07ceb94b4b4c to your computer and use it in GitHub Desktop.
UniDicからアルファベット単語と読みのペアを抽出
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract (alphabetic word, katakana reading) pairs from UniDic CSV files.
#
# Every ``*.csv`` file directly under UNIDIC_PATH is scanned; for each row whose
# column PAIR_COLUMN starts with "<katakana>-<latin text>", one line
# "<lowercased latin text>\t<katakana>" is written to ``result.tsv``.
import csv
import glob
import os
import re

# Group 1: a run of katakana (including the long-vowel mark) at the start of
# the field. Group 2: the latin text after the separating hyphen (letters,
# spaces, apostrophes, hyphens and parentheses).
# Raw string so the regex escapes are not parsed as (invalid) string escapes.
re_pair = re.compile(r"^([ァ-ンー]+)\-([a-zA-Z '\-\(\)]+)")
UNIDIC_PATH = 'path to UniDic directory'
# Zero-based index of the CSV column that is searched for the pair.
PAIR_COLUMN = 11


def iter_pairs(rows):
    """Yield ``(lowercased_alphabets, yomi)`` for each matching row.

    Args:
        rows: Iterable of already-split CSV rows (sequences of strings),
            e.g. a ``csv.reader``.

    Yields:
        Tuples of the lowercased latin text and its katakana reading.
        A row is skipped when it has the same latin text as the previously
        emitted pair and its reading is not longer than that pair's reading.
        Rows with too few columns or without a match are skipped as well.
    """
    prev_alphabets = prev_yomi = ''
    for columns in rows:
        # Blank or truncated lines have no PAIR_COLUMN; ignore them instead
        # of raising IndexError.
        if len(columns) <= PAIR_COLUMN:
            continue
        # The pattern is anchored with ^, so there is at most one match.
        match = re_pair.match(columns[PAIR_COLUMN])
        if match is None:
            continue
        yomi, alphabets = match.groups()
        if prev_alphabets == alphabets and len(yomi) <= len(prev_yomi):
            continue
        yield alphabets.lower(), yomi
        # NOTE(review): the "previous" pair is updated only when a pair is
        # emitted; confirm this matches the intended de-duplication rule.
        prev_alphabets = alphabets
        prev_yomi = yomi


def main(unidic_path=UNIDIC_PATH, out_path='result.tsv', encoding=None):
    """Write the pairs found in ``unidic_path/*.csv`` to ``out_path`` as TSV.

    Args:
        unidic_path: Directory containing the UniDic ``*.csv`` files.
        out_path: Path of the tab-separated output file (overwritten).
        encoding: Text encoding for both input and output files; ``None``
            uses the platform default, as ``open()`` does.
    """
    with open(out_path, 'w', encoding=encoding) as out_fd:
        for csvfile in glob.glob(os.path.join(unidic_path, '*.csv')):
            # newline='' lets the csv module handle line endings itself.
            with open(csvfile, encoding=encoding, newline='') as dic_fd:
                # csv.reader honours quoted fields, so a quoted comma does
                # not shift the column index. De-duplication state is reset
                # for every file because iter_pairs is restarted per file.
                for alphabets, yomi in iter_pairs(csv.reader(dic_fd)):
                    out_fd.write('%s\t%s\n' % (alphabets, yomi))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment