|
# -*- coding: utf-8 -*- |
|
# Copyright: Damien Elmes <[email protected]> |
|
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html |
|
# |
|
# Automatic reading generation with kakasi and mecab. |
|
# See http://ichi2.net/anki/wiki/JapaneseSupport |
|
# |
|
|
|
""" |
|
|
|
Standalone Japanese reading generator based on Damien Elmes' Japanese Support |
|
plugin for Anki: https://ankiweb.net/shared/info/3918629684 |
|
|
|
This script does *not* need Anki to run, but it does use the MeCab and Kakasi |
|
applications included in the Japanese Support plugin. You are expected to |
|
download that plugin and place this file inside your |
|
<Documents/Anki/addons/japanese/> directory alongside `reading.py` and run it |
|
from the command line. |
|
|
|
|
|
|
|
Usage: at the command prompt, run: |
|
|
|
python readingStandAlone.py inputFile [outputFile [formatter]] |
|
|
|
- inputFile should be UTF-8 encoded. |
|
|
|
- outputFile: optional. Output will be written to this file (UTF-8 encoded)
  if provided, or written to the screen if not provided.
|
|
|
- formatter: optional. If omitted or with formatter="defaultFormatter", this |
|
script will put a space before Japanese words, and the hiragana reading in |
|
[square-brackets] immediately after the word. E.g., the following input |
|
sentence: |
|
|
|
お父さんは? |
|
|
|
becomes |
|
|
|
お 父[とう]さんは? |
|
|
|
With formatter="verboseFormatter", the following will be produced: |
|
|
|
お_{父}[とう]さんは? |
|
|
|
Note how the prefix space is replaced by an underscore "_", and the Japanese |
|
word (in this case, just one kanji, but potentially more) is put in {curly |
|
brackets}. |
|
|
|
You can add other formatters to the source code: they should be functions of |
|
two arguments and one optional argument, i.e., with the following definition: |
|
|
|
def newFormatter(kanji, reading, optionalReading=""): |
|
|
|
Caveat: no HTML stripping available. |
|
""" |
|
|
|
import sys, os, platform, re, subprocess |
|
#from anki.utils import stripHTML, isWin, isMac |
|
#from anki.hooks import addHook |
|
# Platform flags. The commented-out import above shows that anki.utils used to
# supply these names; they are computed locally so the script runs without Anki.
isWin = sys.platform.startswith("win32")
isMac = sys.platform.startswith("darwin")
|
def stripHTML(s):
    """Return ``s`` unchanged.

    Stand-in for the Anki helper of the same name: this standalone script
    performs no HTML stripping, so the text is passed straight through.
    """
    return s
|
|
|
def verboseFormatter(kanji, reading, optionalReading=""):
    """Render a word as ``_{kanji}[reading]`` followed by ``optionalReading``.

    kanji: the part of the word that receives the reading.
    reading: the reading placed in square brackets.
    optionalReading: text appended verbatim after the closing bracket.
    """
    template = "_{%s}[%s]%s"
    return template % (kanji, reading, optionalReading)
|
def defaultFormatter(kanji, reading, optionalReading=""):
    """Render a word as `` kanji[reading]`` followed by ``optionalReading``.

    The result starts with a single space. ``optionalReading`` is appended
    verbatim after the closing bracket.
    """
    template = " %s[%s]%s"
    return template % (kanji, reading, optionalReading)
|
|
|
|
|
# Command-line options passed to the kakasi binary.
# NOTE(review): presumably Shift-JIS input/output with kanji and katakana
# converted to hiragana -- confirm against the kakasi manual.
kakasiArgs = [
    "-isjis",
    "-osjis",
    "-u",
    "-JH",
    "-KH",
]

# Command-line options passed to the mecab binary. They make mecab print each
# node as "text[field] " and end each sentence with a newline, which is the
# line format MecabController.reading() splits and parses.
# NOTE(review): %f[7] is presumably the reading field of the dictionary --
# confirm against the MeCab output-format documentation.
mecabArgs = [
    '--node-format=%m[%f[7]] ',
    '--eos-format=\n',
    '--unk-format=%m[] ',
]
|
|
|
def escapeText(text):
    """Return ``text`` with characters that trip up kakasi/mecab replaced.

    Newlines become spaces and the fullwidth tilde (U+FF5E) becomes "~".
    ``<br>`` and ``<br />`` tags are shielded from stripHTML() behind a
    placeholder and come back as ``<br>``.
    """
    placeholder = "---newline---"
    cleaned = text.replace("\n", " ").replace(u'\uff5e', "~")
    # Hide line-break tags so the HTML stripper cannot remove them.
    cleaned = re.sub("<br( /)?>", placeholder, cleaned)
    return stripHTML(cleaned).replace(placeholder, "<br>")
|
|
|
# Startup info handed to every subprocess.Popen call in this file. On Windows
# it carries the STARTF_USESHOWWINDOW flag (presumably so the helper binaries
# do not open a console window); on every other platform it is None.
if sys.platform == "win32":
    si = subprocess.STARTUPINFO()
    try:
        si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    except AttributeError:
        # Some Python versions only expose the constant on the private
        # _subprocess module. Only a missing attribute is expected here, so
        # nothing broader is caught.
        si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
    si = None
|
|
|
# Mecab |
|
########################################################################## |
|
|
|
def mungeForPlatform(popen):
    """Return a copy of the command list ``popen`` adapted to this platform.

    popen: list of command-line strings whose first item is the path of the
        binary without a platform suffix.

    On Windows every item is passed through os.path.normpath and ".exe" is
    appended to the binary. On macOS the items are returned as they are. On
    any other platform ".lin" is appended to the binary.

    The argument itself is never modified. Previously the non-Windows
    branches edited the caller's list in place while the Windows branch
    returned a new one.
    """
    if isWin:
        popen = [os.path.normpath(x) for x in popen]
        suffix = ".exe"
    else:
        popen = list(popen)
        suffix = "" if isMac else ".lin"
    popen[0] += suffix
    return popen
|
|
|
class MecabController(object):
    """Drive a mecab subprocess and annotate Japanese text with readings.

    The process is started on the first call to reading() and is then fed one
    line of text per request.
    """

    def __init__(self):
        # Popen handle of the mecab process; None until ensureOpen() starts it.
        self.mecab = None

    def setup(self):
        """Build the mecab command line and prepare the environment for it."""
        # NOTE: this is a relative path, so it is resolved against the
        # current working directory when the process is launched.
        base = "../../addons/japanese/support/"
        self.mecabCmd = mungeForPlatform(
            [base + "mecab"] + mecabArgs + [
                '-d', base, '-r', base + "mecabrc"])
        # Presumably so the bundled binary finds its shared libraries.
        os.environ['DYLD_LIBRARY_PATH'] = base
        os.environ['LD_LIBRARY_PATH'] = base
        if not isWin:
            # 0o755 is valid on Python 2.6+ and Python 3; the old spelling
            # 0755 is a syntax error on Python 3.
            os.chmod(self.mecabCmd[0], 0o755)

    def ensureOpen(self):
        """Start the mecab process if it is not running yet.

        Raises Exception when the binary cannot be launched (OSError).
        """
        if not self.mecab:
            self.setup()
            try:
                self.mecab = subprocess.Popen(
                    self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError:
                raise Exception("Please ensure your Linux system has 32 bit binary support.")

    def reading(self, expr, formatter=defaultFormatter):
        """Return ``expr`` with readings added to the words that need them.

        expr: text to annotate (a unicode string).
        formatter: callable ``(kanji, reading, optionalReading="")`` that
            renders one annotated word.

        Tokens that are already kana, that have no reading, or that are
        numerals are copied through unchanged. For the rest, characters
        shared by the start/end of the word and of its reading are kept
        outside the annotation.

        Raises Exception if mecab cannot be started, and AttributeError if a
        token in mecab's output is not of the form ``text[reading]``.
        """
        self.ensureOpen()
        expr = escapeText(expr)
        # bytes + b'\n' and .decode() behave the same on Python 2.6+ and 3;
        # str + '\n' and unicode() only work on Python 2.
        self.mecab.stdin.write(expr.encode("euc-jp", "ignore") + b'\n')
        self.mecab.stdin.flush()
        expr = self.mecab.stdout.readline().rstrip(b'\r\n').decode("euc-jp")
        out = []
        for node in expr.split(" "):
            if not node:
                break
            (kanji, reading) = re.match(r"(.+)\[(.*)\]", node).groups()
            # hiragana, punctuation, not japanese, or lacking a reading
            if kanji == reading or not reading:
                out.append(kanji)
                continue
            # Convert the reading to hiragana. One kakasi round trip covers
            # both the katakana-word case and the "ended up the same" case;
            # asking twice always gave the same answer.
            reading = kakasi.reading(reading)
            if kanji == reading:
                out.append(kanji)
                continue
            # don't add readings of numbers (this is a substring test, so in
            # practice it matches single numeral tokens)
            if kanji in u"一二三四五六七八九十0123456789":
                out.append(kanji)
                continue
            # Strip matching characters at the beginning and end of reading
            # and kanji. The reading is normally at least as long as the
            # kanji; the min() bounds keep the indexing safe when it is not.
            placeL = 0
            placeR = 0
            for i in range(1, min(len(kanji), len(reading) + 1)):
                if kanji[-i] != reading[-i]:
                    break
                placeR = i
            for i in range(min(len(kanji) - 1, len(reading))):
                if kanji[i] != reading[i]:
                    break
                placeL = i + 1
            if placeR:
                annotated = formatter(
                    kanji[placeL:-placeR], reading[placeL:-placeR],
                    reading[-placeR:])
            else:
                annotated = formatter(kanji[placeL:], reading[placeL:])
            if placeL:
                # shared leading characters stay in front of the annotation
                annotated = "%s%s" % (reading[:placeL], annotated)
            out.append(annotated)
        pieces = []
        for c, s in enumerate(out):
            # put a space in front of tokens made only of ASCII letters/digits
            if c < len(out) - 1 and re.match("^[A-Za-z0-9]+$", out[c + 1]):
                s += " "
            pieces.append(s)
        # the spacing rule above also splits "<br>" into "< br>"; undo that
        return u"".join(pieces).strip().replace("< br>", "<br>")
|
|
|
# Kakasi |
|
########################################################################## |
|
|
|
class KakasiController(object):
    """Drive a kakasi subprocess that converts readings to hiragana.

    The process is started on the first call to reading() and is then fed one
    line of text per request.
    """

    def __init__(self):
        # Popen handle of the kakasi process; None until ensureOpen() starts it.
        self.kakasi = None

    def setup(self):
        """Build the kakasi command line and point it at its dictionaries."""
        # NOTE: this is a relative path, so it is resolved against the
        # current working directory when the process is launched.
        base = "../../addons/japanese/support/"
        self.kakasiCmd = mungeForPlatform(
            [base + "kakasi"] + kakasiArgs)
        os.environ['ITAIJIDICT'] = base + "itaijidict"
        os.environ['KANWADICT'] = base + "kanwadict"
        if not isWin:
            # 0o755 is valid on Python 2.6+ and Python 3; the old spelling
            # 0755 is a syntax error on Python 3.
            os.chmod(self.kakasiCmd[0], 0o755)

    def ensureOpen(self):
        """Start the kakasi process if it is not running yet.

        Raises Exception when the binary cannot be launched (OSError).
        """
        if not self.kakasi:
            self.setup()
            try:
                self.kakasi = subprocess.Popen(
                    self.kakasiCmd, bufsize=-1, stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                    startupinfo=si)
            except OSError:
                raise Exception("Please install kakasi")

    def reading(self, expr):
        """Send ``expr`` through kakasi and return the converted line.

        expr: text to convert (a unicode string). Characters that cannot be
            encoded as Shift-JIS are dropped before sending.
        Returns the line kakasi answers with, decoded from Shift-JIS and with
        the trailing line break removed.
        """
        self.ensureOpen()
        expr = escapeText(expr)
        # bytes + b'\n' and .decode() behave the same on Python 2.6+ and 3;
        # str + '\n' and unicode() only work on Python 2.
        self.kakasi.stdin.write(expr.encode("sjis", "ignore") + b'\n')
        self.kakasi.stdin.flush()
        res = self.kakasi.stdout.readline().rstrip(b'\r\n').decode("sjis")
        return res
|
|
|
# Init |
|
########################################################################## |
|
|
|
# Module-level controller instances. Constructing them starts no process; the
# binaries are launched on the first reading() call.
mecab = MecabController()
kakasi = KakasiController()
|
|
|
# Tests |
|
########################################################################## |
|
|
|
if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No arguments: run the built-in samples, then show the usage line.
        samples = [
            u"カリン、自分でまいた種は自分で刈り取れ",
            u"昨日、林檎を2個買った。",
            u"真莉、大好きだよん^^",
            u"彼2000万も使った。",
            u"彼二千三百六十円も使った。",
            u"千葉",
        ]
        # print(x) with one argument works as a statement on Python 2 and as
        # a function call on Python 3. All samples are printed the same way.
        for expr in samples:
            print(mecab.reading(expr))

        print("\n")
        print("Usage: python readingStandAlone.py inputFile [outputFile [formatter]]")
    elif len(sys.argv) >= 2:
        import codecs

        # Formatters selectable by name as the third argument. Register new
        # formatters here; an unknown name falls back to the default.
        formatters = {
            "defaultFormatter": defaultFormatter,
            "verboseFormatter": verboseFormatter,
        }
        formatter = defaultFormatter
        if len(sys.argv) >= 4:
            formatter = formatters.get(sys.argv[3], defaultFormatter)

        # Without an output file name the result goes to the screen.
        stdout = len(sys.argv) < 3
        if stdout:
            outfid = sys.stdout
        else:
            outfid = codecs.open(sys.argv[2], "w", "utf8")

        try:
            with codecs.open(sys.argv[1], "r", "utf8") as fid:
                allLines = fid.readlines()

            for s in allLines:
                outfid.write(mecab.reading(s, formatter))
                outfid.write('\n')
        finally:
            # Close only the file opened above. The previous test was
            # inverted: it closed sys.stdout and left the output file open.
            if not stdout:
                outfid.close()
|
|
|
|
|
|