Created
April 13, 2011 15:57
-
-
Save youzaka/917818 to your computer and use it in GitHub Desktop.
mecab-ipadicから捨て仮名だけで構成されている語彙を抽出
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.6
# -*- coding: utf-8 -*-
"""List dictionary entries whose reading uses only kana that have a small form.

Usage: pass the directory holding the EUC-JP encoded ``*.csv`` dictionary
files (e.g. an unpacked mecab-ipadic source tree) as the first argument.

For every matching entry the script prints the first column, the reading and
the reading rewritten with small kana ("sutegana"), shortest readings first,
followed by a summary line ``matched / distinct entries = percentage %``.

The code runs unchanged on Python 2.6+ and Python 3.
"""
import codecs
import csv
import os
import re
import sys

# The small "pu" (small "fu" plus a combining handakuten) occupies two code
# points, so it cannot take part in a one-to-one character table and is left
# out of both lists.
motoneta = list(u'アイウエオカクケシスツトヌハヒフヘホムヤユヨラリルレロワ')
sutegana = list(u'ァィゥェォヵㇰヶㇱㇲッㇳㇴㇵㇶㇷㇸㇹㇺャュョㇻㇼㇽㇾㇿヮ')

# Full-size katakana -> small counterpart, paired by position in the lists.
table = dict(zip(motoneta, sutegana))

# Matches a non-empty string made only of kana that have a small counterpart.
regex = re.compile(u'^[%s]+$' % u''.join(motoneta))

# Column positions read from each CSV row. In the mecab-ipadic layout these
# are presumably surface form, conjugation form and reading -- confirm against
# the dictionary documentation if the input format differs.
_SURFACE = 0
_FORM = 9
_READING = 11

# Only rows whose form column holds one of these values are reported.
_ACCEPTED_FORMS = (u'基本形', u'*')


def to_sutegana(x):
    """Return *x* with every kana listed in ``motoneta`` replaced by its small form.

    Characters without an entry in ``table`` are copied through unchanged.
    """
    return u''.join([table.get(char, char) for char in x])


def _read_rows(path):
    """Yield every CSV row of the EUC-JP file *path* as a list of text strings.

    The file is closed as soon as the generator is exhausted or discarded.
    """
    if sys.version_info[0] >= 3:
        # Python 3's csv module works on text; newline='' lets it handle line
        # endings (and newlines inside quoted fields) itself.
        with open(path, encoding='euc_jp', newline='') as handle:
            for row in csv.reader(handle):
                yield row
    else:
        # Python 2's csv module does not support unicode input, so parse the
        # raw bytes and decode each cell afterwards. This is safe because
        # EUC-JP never uses bytes below 0x80 inside a multi-byte character,
        # so ASCII commas, quotes and newlines are always real delimiters.
        with open(path, 'rb') as handle:
            for row in csv.reader(handle):
                yield [cell.decode('euc_jp') for cell in row]


def collect(directory):
    """Scan every ``*.csv`` file directly inside *directory*.

    Returns a ``(total, count)`` pair: ``total`` is the set of all first-column
    values seen, ``count`` maps the first-column value of each matching row to
    its reading. A row matches when its reading consists only of kana from
    ``motoneta`` and its form column is one of ``_ACCEPTED_FORMS``.

    Paths are joined onto *directory* instead of changing the working
    directory, so relative directory arguments work.
    """
    total = set()
    count = dict()
    # Sorted so that "last row wins" for duplicated keys is reproducible.
    for name in sorted(os.listdir(directory)):
        if not name.endswith('.csv'):
            continue
        for item in _read_rows(os.path.join(directory, name)):
            if not item:
                # Blank line in the CSV file.
                continue
            total.add(item[_SURFACE])
            if len(item) <= _READING:
                # Row too short to carry a reading; it still counts in total.
                continue
            reading = item[_READING]
            if regex.match(reading) and item[_FORM] in _ACCEPTED_FORMS:
                count[item[_SURFACE]] = reading
    return total, count


def main(argv):
    """Run the report for the directory named in ``argv[1]``.

    Exits with a usage message when the directory argument is missing.
    """
    if len(argv) < 2:
        sys.exit('usage: %s DICTIONARY_DIR' % (argv[0] if argv else 'script'))

    total, count = collect(argv[1])

    # Shortest readings first; ties are broken by reading and then by key so
    # the output order does not depend on dictionary iteration order.
    for k, v in sorted(count.items(), key=lambda x: (len(x[1]), x[1], x[0])):
        print(u'%s %s %s' % (k, v, to_sutegana(v)))

    # Avoid dividing by zero when no CSV rows were found at all.
    percent = len(count) * 100.0 / len(total) if total else 0.0
    print("%d / %d = %f %%" % (len(count), len(total), percent))


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment