Last active
June 17, 2022 10:42
-
-
Save ooharak/b14753d537b570f934947c8e17595097 to your computer and use it in GitHub Desktop.
「あ・い・う・え・お」の各段が一音ずつ含まれる単語を抽出
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Source data:
#   Matsushita, Tatsuhiko (2011), "Vocabulary Database for Reading Japanese
#   (VDRJ) Ver. 1.0 (Research Version)"
#   http://www17408ui.sakura.ne.jp/tatsum/database/VDRJ_Ver1_1_Research_Top60894.xlsx
#   Downloaded: 2022-06-17
#
# Extracts the words whose reading contains exactly one sound from each of
# the five vowel rows (a, i, u, e, o).
# (Verbs and long-vowel marks are excluded; the geminate small "tsu" is allowed.)
from openpyxl import load_workbook
import pykakasi

kks = pykakasi.kakasi()
docdir = './'
vdrj_filename = docdir + 'VDRJ_Ver1_1_Research_Top60894.xlsx'
vowels = ['a', 'e', 'i', 'o', 'u']


def _has_each_vowel_once(roma):
    """Return True when every letter in ``vowels`` occurs exactly once in *roma*."""
    return all(roma.count(c) == 1 for c in vowels)


wb = load_workbook(filename=vdrj_filename, read_only=True)
try:
    ws = wb['重要度順語彙リスト60894語']
    for row in ws.rows:
        # Skip rows too short to hold the three columns read below.
        if len(row) <= 16:
            continue

        # NOTE(review): column meanings are inferred from the variable names
        # (written form, katakana reading, part of speech) -- confirm against
        # the spreadsheet layout.
        kanji = row[13].value
        kana = row[15].value
        hinshi = row[16].value

        # Empty or non-text reading cells cannot be romanized.
        if not isinstance(kana, str) or not kana:
            continue
        # Verbs are excluded. A missing part-of-speech cell is treated as
        # "not a verb" instead of raising AttributeError on None.
        if isinstance(hinshi, str) and hinshi.startswith('動詞'):
            continue
        # Readings containing the moraic nasal 'ン' are skipped.
        if 'ン' in kana:
            continue

        # convert() returns a list of segments; join them all so that vowels
        # in every segment are counted, not only those in the first one.
        # NOTE(review): there is no explicit long-vowel check; presumably the
        # romanization repeats the vowel so the count test rejects such
        # readings -- confirm.
        roma = ''.join(item['hepburn'] for item in kks.convert(kana))
        if _has_each_vowel_once(roma):
            print(kanji)
finally:
    # Must be called: a bare `wb.close` only references the method and
    # leaves the read-only workbook's file handle open.
    wb.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment