Last active
May 20, 2019 04:50
-
-
Save yokolet/cafd4e87115e9c9720a053063853c3d5 to your computer and use it in GitHub Desktop.
katakana transcription
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://en.wikibooks.org/wiki/Japanese/Transcribing_English_to_Japanese | |
import os | |
import sys | |
sys.path.append(os.path.abspath('./English-to-IPA')) | |
from pykakasi import kakasi | |
import romkan | |
import eng_to_ipa as ipa | |
import re | |
# Letters counted as vowels when scanning the English spelling.
vowels = "aeiou"
# Phonetic symbols the vowel pass treats as (the start of) a vowel sound.
vowsym = "ɑiɪɛɜæʌɒɔʊuəeajo"
# Phonetic symbols the consonant pass rewrites through a dedicated rule.
conssym = "θðlŋjhwvʃʧtdkpgm"
def a_short_rule(word, w_idx):
    """Romanize the vowel sound ɑ.

    Args:
        word: The English spelling being transcribed.
        w_idx: Index of the spelling letter aligned with this sound. It may
            equal ``len(word)`` when the phonetics contain more vowel sounds
            than the spelling has vowel groups.

    Returns:
        'o' when the aligned letter is an "o", otherwise 'a'.
    """
    # Guard the lookup: the aligned index can sit one past the last letter.
    if w_idx < len(word) and word[w_idx] == 'o':
        return 'o'
    return 'a'
def a_long_rule(word, w_idx):
    """Romanize a long "a" sound (ɑː, or a vowel followed by r) as 'aa'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'aa'
def i_long_rule(word, w_idx):
    """Romanize iː (or plain i) as the long vowel 'ii'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'ii'
def i_short_rule(word, w_idx):
    """Romanize the short vowel ɪ as 'i'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'i'
def e_short_rule(word, w_idx):
    """Romanize the short vowel ɛ as 'e'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'e'
def e_long_rule(word, w_idx):
    """Romanize ɜː as 'aa' (the long form of the two options 'aa' / 'a').

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'aa'
def ae_rule(word, w_idx):
    """Romanize the vowel æ.

    Args:
        word: The English spelling being transcribed.
        w_idx: Index of the spelling letter aligned with this sound.

    Returns:
        'ya' when the preceding spelling letter is "c" or "g" (giving
        kya / gya once the consonant is added), otherwise 'a'.
    """
    has_previous = w_idx >= 1
    if has_previous and word[w_idx - 1] in ('c', 'g'):
        return 'ya'
    return 'a'
def hat_rule(word, w_idx):
    """Romanize the vowel sound ʌ.

    Args:
        word: The English spelling being transcribed.
        w_idx: Index of the spelling letter aligned with this sound. It may
            equal ``len(word)`` when the phonetics contain more vowel sounds
            than the spelling has vowel groups.

    Returns:
        'o' when the sound is spelt with an "o" (e.g. monkey, front),
        otherwise 'a'.
    """
    # Guard the lookup: the aligned index can sit one past the last letter.
    if w_idx < len(word) and word[w_idx] == 'o':
        return 'o'
    return 'a'
def o_short_rule(word, w_idx):
    """Romanize the short vowel ɒ as 'o'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'o'
def o_long_rule(word, w_idx):
    """Romanize the long vowel ɔː as 'oo'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'oo'
def u_short_rule(word, w_idx):
    """Romanize the short vowel ʊ as 'u'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'u'
def u_long_rule(word, w_idx):
    """Romanize the long vowel uː as 'uu'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'uu'
def e2_rule(word, w_idx):
    """Romanize the schwa ə using the English spelling as a hint.

    Args:
        word: The English spelling being transcribed.
        w_idx: Index of the spelling letter aligned with this sound. It may
            equal ``len(word)`` when the phonetics contain more vowel sounds
            than the spelling has vowel groups.

    Returns:
        'u' when the two letters before ``w_idx`` are "bl" or "gl";
        'a' when ``w_idx`` is the last letter or lies past the end;
        'o' when the letter is an "o" not followed by "u";
        'a' in every other case (including "ou").
    """
    if w_idx >= 2 and word[w_idx - 2:w_idx] in ('bl', 'gl'):
        return 'u'
    # Covers both the final letter and an index past the end of the
    # spelling, so the letter lookups below are always in range.
    if w_idx >= len(word) - 1:
        return 'a'
    if word[w_idx] == 'o':
        return 'a' if word[w_idx + 1] == 'u' else 'o'
    return 'a'
def ei_rule(word, w_idx):
    """Romanize eɪ (or plain e).

    Args:
        word: The English spelling being transcribed.
        w_idx: Index of the spelling letter aligned with this sound.

    Returns:
        'ei' when the next spelling letter is "y", otherwise 'e'.
    """
    followed_by_y = w_idx < len(word) - 1 and word[w_idx + 1] == 'y'
    return 'ei' if followed_by_y else 'e'
def ai_rule(word, w_idx):
    """Romanize the diphthong aɪ as 'ai'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'ai'
def oi_rule(word, w_idx):
    """Romanize the diphthong ɔɪ as 'ooi' (the longer of 'ooi' / 'oi').

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'ooi'
def eu_rule(word, w_idx):
    """Romanize the diphthong əʊ as 'oo' (the longer of 'o' / 'oo').

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'oo'
def au_rule(word, w_idx):
    """Romanize the diphthong aʊ as 'au'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'au'
def ie_rule(word, w_idx):
    """Romanize ɪə as 'iaa' (the longer of 'ia' / 'iaa').

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'iaa'
def ee_rule(word, w_idx):
    """Romanize ɛə as 'eaa' (the longer of 'ea' / 'eaa').

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'eaa'
def ue_rule(word, w_idx):
    """Romanize ʊə as 'uaa'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'uaa'
def ju_long_rule(word, w_idx):
    """Romanize juː as 'yuu'.

    Both arguments are accepted only to match the shared rule signature.
    """
    return 'yuu'
def jvow_rule(word, w_idx):
    """Return a lone 'j' unchanged.

    Used for a 'j' that is not part of a longer vowel sequence such as juː.
    'j' is also listed among the consonant symbols, so it is presumably
    resolved later by the consonant pass.
    Both arguments are accepted only to match the shared rule signature.
    """
    return 'j'
# Dispatch table: phonetic vowel sequence -> rule(word, w_idx) that returns
# its romanization.  Keys are one to three symbols long.
vowel_map = {
    # a-like sounds
    'ɑ': a_short_rule,
    'ɑː': a_long_rule,
    'ɑr': a_long_rule,
    # i-like sounds
    'iː': i_long_rule,
    'i': i_long_rule,
    'ɪ': i_short_rule,
    # e-like sounds
    'ɛ': e_short_rule,
    'ɜː': e_long_rule,
    'æ': ae_rule,
    'ʌ': hat_rule,
    # o-like sounds
    'ɒ': o_short_rule,
    'ɔː': o_long_rule,
    'ɔ': o_long_rule,
    'ɔr': o_long_rule,
    # u-like sounds
    'ʊ': u_short_rule,
    'uː': u_long_rule,
    'u': u_long_rule,
    # schwa
    'ə': e2_rule,
    'ər': a_long_rule,
    # diphthongs and vowel + r
    'eɪ': ei_rule,
    'e': ei_rule,
    'aɪ': ai_rule,
    'ɔɪ': oi_rule,
    'əʊ': eu_rule,
    'oʊ': eu_rule,
    'aʊ': au_rule,
    'ɪə': ie_rule,
    'ɪr': ie_rule,
    'ɛə': ee_rule,
    'ɛr': ee_rule,
    'ʊə': ue_rule,
    'ʊr': ue_rule,
    # j-initial sequences
    'juː': ju_long_rule,
    'ju': ju_long_rule,
    'j': jvow_rule,
}
# æ after k => kya | |
# æ after g => gya | |
# ʌ spelt with an "o" => o, ex: monkey, front | |
# non-final ə => ?, ex: about, pilot, london | |
# final position ə spelt as "-r" => aa | |
# final position ə spelt with an "a" => a | |
def transVowel(word, ph):
    """Replace the vowel sounds of a phonetic string with romanized vowels.

    The spelling and the phonetic string are walked in parallel: each vowel
    sound found in ``ph`` is paired with the next run of vowel letters in
    ``word`` so that the rule functions can consult the spelling.

    Args:
        word: The English spelling.
        ph: The phonetic transcription of ``word``.

    Returns:
        ``ph`` with every recognised vowel sequence replaced by the output
        of its rule in ``vowel_map``.  All other symbols are copied through
        unchanged, including symbols listed in ``vowsym`` that have no
        entry in ``vowel_map`` on their own (for example a bare 'ɜ', 'a'
        or 'o'), which previously raised ``KeyError``.
    """
    pieces = []
    w_idx = 0
    p_idx = 0
    word_len = len(word)
    ph_len = len(ph)
    while w_idx < word_len or p_idx < ph_len:
        # Spelling side: advance to the next vowel letter.
        while w_idx < word_len and word[w_idx] not in vowels:
            w_idx += 1
        # Phonetic side: non-vowel symbols are copied as-is for a later pass.
        while p_idx < ph_len and ph[p_idx] not in vowsym:
            pieces.append(ph[p_idx])
            p_idx += 1
        # Longest match first: 'juː', then any two-symbol key, then one symbol.
        if ph[p_idx:p_idx + 3] == 'juː':
            pieces.append(vowel_map['juː'](word, w_idx))
            p_idx += 3
        elif p_idx + 2 <= ph_len and ph[p_idx:p_idx + 2] in vowel_map:
            pieces.append(vowel_map[ph[p_idx:p_idx + 2]](word, w_idx))
            p_idx += 2
        elif p_idx < ph_len:
            symbol = ph[p_idx]
            rule = vowel_map.get(symbol)
            # A symbol with no rule is kept verbatim instead of raising
            # KeyError, in line with how unmapped consonant symbols are kept.
            pieces.append(symbol if rule is None else rule(word, w_idx))
            p_idx += 1
        # A vowel sound may be spelt with several letters; skip the whole run.
        while w_idx < word_len and word[w_idx] in vowels:
            w_idx += 1
    return ''.join(pieces)
def th_clear_rule(word, ph, p_idx):
    """Romanize θ as 's'.

    The arguments are accepted only to match the shared rule signature.
    """
    return 's'
def th_hakuon_rule(word, ph, p_idx):
    """Romanize ð as 'z'.

    The arguments are accepted only to match the shared rule signature.
    """
    return 'z'
def l_rule(word, ph, p_idx):
    """Romanize l as 'r'.

    The arguments are accepted only to match the shared rule signature.
    """
    return 'r'
def n_rule(word, ph, p_idx):
    """Romanize ŋ.

    Args:
        word: The English spelling being transcribed.
        ph: Unused; present to match the shared rule signature.
        p_idx: Unused; present to match the shared rule signature.

    Returns:
        'Ng' when the spelling contains "ng" anywhere, otherwise 'N'.
    """
    return 'Ng' if 'ng' in word else 'N'
def jcon_rule(word, ph, p_idx):
    """Romanize the consonant j.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 'j' within ``ph``.

    Returns:
        'i' when the next symbol is i, ɪ or e; otherwise 'j'.
    """
    has_next = p_idx < len(ph) - 1
    if has_next and ph[p_idx + 1] in 'iɪe':
        return 'i'
    return 'j'
def h_rule(word, ph, p_idx):
    """Romanize h.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 'h' within ``ph``.

    Returns:
        'f' when the next symbol is u or ʊ; otherwise 'h'.
    """
    has_next = p_idx < len(ph) - 1
    if has_next and ph[p_idx + 1] in 'uʊ':
        return 'f'
    return 'h'
def w_rule(word, ph, p_idx):
    """Romanize w.

    Args:
        word: The English spelling being transcribed.
        ph: Unused; present to match the shared rule signature.
        p_idx: Position of the 'w' within the processed string.

    Returns:
        'how' when the sound is the first symbol and the spelling starts
        with "wh"; otherwise 'u'.
    """
    starts_with_wh = word[0:2] == 'wh'
    return 'how' if p_idx == 0 and starts_with_wh else 'u'
def v_rule(word, ph, p_idx):
    """Romanize v as 'b'.

    The arguments are accepted only to match the shared rule signature.
    """
    return 'b'
def s_rule(word, ph, p_idx):
    """Romanize ʃ.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 'ʃ' within ``ph``.

    Returns:
        'sshu' when this ʃ is the last symbol of ``ph`` and is preceded by
        a vowel symbol; otherwise 'sh'.
    """
    # The decision is made for the symbol at p_idx itself.  Inspecting the
    # tail of ph instead would turn every ʃ of a word ending in vowel + ʃ
    # into 'sshu'.
    is_final = p_idx == len(ph) - 1
    if is_final and p_idx >= 1 and ph[p_idx - 1] in vowsym:
        return 'sshu'
    return 'sh'
def ts_rule(word, ph, p_idx):
    """Romanize ʧ.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 'ʧ' within ``ph``.

    Returns:
        'cchi' when the symbol follows exactly one vowel symbol (the symbol
        before it is a vowel and the one before that is absent or not a
        vowel); otherwise 'ch'.
    """
    after_vowel = p_idx >= 1 and ph[p_idx] == 'ʧ' and ph[p_idx - 1] in vowsym
    # p_idx < 2 means there is no symbol two places back; indexing it would
    # wrap around to the end of ph.
    if after_vowel and (p_idx < 2 or ph[p_idx - 2] not in vowsym):
        return 'cchi'
    return 'ch'
def t_rule(word, ph, p_idx):
    """Romanize t.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 't' within ``ph``.

    Returns:
        'tto' when the symbol follows exactly one vowel symbol (the symbol
        before it is a vowel and the one before that is absent or not a
        vowel); otherwise 't'.
    """
    after_vowel = p_idx >= 1 and ph[p_idx] == 't' and ph[p_idx - 1] in vowsym
    # p_idx < 2 means there is no symbol two places back; indexing it would
    # wrap around to the end of ph.
    if after_vowel and (p_idx < 2 or ph[p_idx - 2] not in vowsym):
        return 'tto'
    return 't'
def d_rule(word, ph, p_idx):
    """Romanize d.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 'd' within ``ph``.

    Returns:
        'z' when the next symbol is 'z'; 'ddo' when the symbol follows
        exactly one vowel symbol (the symbol before it is a vowel and the
        one before that is absent or not a vowel); otherwise 'd'.
    """
    if p_idx < len(ph) - 1 and ph[p_idx + 1] == 'z':
        return 'z'
    after_vowel = p_idx >= 1 and ph[p_idx] == 'd' and ph[p_idx - 1] in vowsym
    # p_idx < 2 means there is no symbol two places back; indexing it would
    # wrap around to the end of ph.
    if after_vowel and (p_idx < 2 or ph[p_idx - 2] not in vowsym):
        return 'ddo'
    return 'd'
def kpg_rule(word, ph, p_idx):
    """Romanize k, p or g.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the consonant within ``ph``.

    Returns:
        The consonant doubled plus 'u' ('kku', 'ppu', 'ggu') when it follows
        exactly one vowel symbol (the symbol before it is a vowel and the
        one before that is absent or not a vowel); otherwise the consonant
        unchanged.
    """
    symbol = ph[p_idx]
    after_vowel = p_idx >= 1 and symbol in 'kpg' and ph[p_idx - 1] in vowsym
    # p_idx < 2 means there is no symbol two places back; indexing it would
    # wrap around to the end of ph.
    if after_vowel and (p_idx < 2 or ph[p_idx - 2] not in vowsym):
        return symbol * 2 + 'u'
    return symbol
def m_rule(word, ph, p_idx):
    """Romanize m.

    Args:
        word: Unused; present to match the shared rule signature.
        ph: The string being processed by the consonant pass.
        p_idx: Position of the 'm' within ``ph``.

    Returns:
        'n' when another symbol follows and it is not a vowel symbol;
        otherwise 'm' (including when 'm' is the last symbol).
    """
    has_next = p_idx < len(ph) - 1
    if has_next and ph[p_idx + 1] not in vowsym:
        return 'n'
    return 'm'
# Dispatch table: consonant symbol -> rule(word, ph, p_idx) that returns its
# romanization.  There is one entry for every symbol in conssym.
consonant_map = {
    'θ': th_clear_rule,
    'ð': th_hakuon_rule,
    'l': l_rule,
    'ŋ': n_rule,
    'j': jcon_rule,
    'h': h_rule,
    'w': w_rule,
    'v': v_rule,
    'ʃ': s_rule,
    'ʧ': ts_rule,
    't': t_rule,
    'd': d_rule,
    # k, p and g share one doubling rule
    'k': kpg_rule,
    'p': kpg_rule,
    'g': kpg_rule,
    'm': m_rule,
}
def transConsonants(word, step2):
    """Rewrite the consonant symbols of a vowel-converted string.

    Args:
        word: The English spelling, forwarded to each consonant rule.
        step2: The string produced by the vowel pass.

    Returns:
        ``step2`` with every symbol listed in ``conssym`` replaced by the
        output of its rule in ``consonant_map``; all other characters are
        copied through unchanged.
    """
    pieces = []
    for pos, symbol in enumerate(step2):
        if symbol in conssym:
            # Rules receive the whole string and the position so they can
            # look at neighbouring symbols.
            pieces.append(consonant_map[symbol](word, step2, pos))
        else:
            pieces.append(symbol)
    return ''.join(pieces)
def transcribe(word):
    """Run the transcription steps for one word and print each stage.

    Prints the word, its phonetic form, the result of the vowel pass and
    the result of the consonant pass on one line.  Nothing is returned.

    Args:
        word: The English word to transcribe.
    """
    # step 1: make phonetic
    phonetic = ipa.convert(word)
    # step 2: convert vowels
    after_vowels = transVowel(word, phonetic)
    # step 3: convert consonants
    after_consonants = transConsonants(word, after_vowels)
    print(word, phonetic, after_vowels, after_consonants)
    # step 4: add epenthetic vowels (no code for this step is present here)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment