Created
January 12, 2012 15:34
-
-
Save kiwanami/1601147 to your computer and use it in GitHub Desktop.
Emacsで日本語逆変換
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; decompjp.el --- decompose Japanese text. | |
;; Copyright (C) 2011 SAKURAI Masashi | |
;; Author: SAKURAI Masashi <m.sakurai at kiwanami.net> | |
;; Keywords: languages, tools | |
;; This program is free software; you can redistribute it and/or modify | |
;; it under the terms of the GNU General Public License as published by | |
;; the Free Software Foundation, either version 3 of the License, or | |
;; (at your option) any later version. | |
;; This program is distributed in the hope that it will be useful, | |
;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
;; GNU General Public License for more details. | |
;; You should have received a copy of the GNU General Public License | |
;; along with this program. If not, see <http://www.gnu.org/licenses/>. | |
;;; Commentary: | |
;; MeCabなどを利用して、逆変換を試みるプログラム。 | |
;; | |
;; dcj:reverse-translate コマンドを実行すると、現在の位置から適当な位置 | |
;; まで形態素解析して逆変換を試みる。リージョンが指定されてあれば、その | |
;; 範囲を逆変換する。 | |
;; | |
;; ibus-mozc, anthy など、一般的な変換プログラムで利用できるはず。 | |
;;; Code: | |
(eval-when-compile (require 'cl)) | |
;;================================================== | |
;; Customize | |
(defvar dcj:key-commit ?\ "変換開始キー") | |
(defvar dcj:romaji-kana-table | |
'(( "a" . "あ") ( "i" . "い") ( "u" . "う") ( "e" . "え") ( "o" . "お") | |
("ka" . "か") ("ki" . "き") ("ku" . "く") ("ke" . "け") ("ko" . "こ") | |
("sa" . "さ") ("si" . "し") ("su" . "す") ("se" . "せ") ("so" . "そ") | |
("ta" . "た") ("ti" . "ち") ("tu" . "つ") ("te" . "て") ("to" . "と") | |
("na" . "な") ("ni" . "に") ("nu" . "ぬ") ("ne" . "ね") ("no" . "の") | |
("ha" . "は") ("hi" . "ひ") ("hu" . "ふ") ("he" . "へ") ("ho" . "ほ") | |
("ma" . "ま") ("mi" . "み") ("mu" . "む") ("me" . "め") ("mo" . "も") | |
("ya" . "や") ("yu" . "ゆ") ("yo" . "よ") | |
("ra" . "ら") ("ri" . "り") ("ru" . "る") ("re" . "れ") ("ro" . "ろ") | |
("wa" . "わ") ("wi" . "ゐ") ("wu" . "う") ("we" . "ゑ") ("wo" . "を") | |
("n'" . "ん") ( "n" . "ん") ("m'" . "ん") ( "m" . "ん") | |
("ga" . "が") ("gi" . "ぎ") ("gu" . "ぐ") ("ge" . "げ") ("go" . "ご") | |
("za" . "ざ") ("zi" . "じ") ("zu" . "ず") ("ze" . "ぜ") ("zo" . "ぞ") | |
("da" . "だ") ("di" . "ぢ") ("du" . "づ") ("de" . "で") ("do" . "ど") | |
("ba" . "ば") ("bi" . "び") ("bu" . "ぶ") ("be" . "べ") ("bo" . "ぼ") | |
("pa" . "ぱ") ("pi" . "ぴ") ("pu" . "ぷ") ("pe" . "ぺ") ("po" . "ぽ") | |
("kya" . "きゃ") ("kyu" . "きゅ") ("kye" . "きぇ") ("kyo" . "きょ") | |
("sya" . "しゃ") ("syu" . "しゅ") ("sye" . "しぇ") ("syo" . "しょ") | |
("sha" . "しゃ") ("shu" . "しゅ") ("she" . "しぇ") ("sho" . "しょ") | |
("cha" . "ちゃ") ("chu" . "ちゅ") ("che" . "ちぇ") ("cho" . "ちょ") | |
("cya" . "ちゃ") ("cyu" . "ちゅ") ("cye" . "ちぇ") ("cyo" . "ちょ") | |
("tya" . "ちゃ") ("tyu" . "ちゅ") ("tye" . "ちぇ") ("tyo" . "ちょ") | |
("nya" . "にゃ") ("nyu" . "にゅ") ("nye" . "にぇ") ("nyo" . "にょ") | |
("hya" . "ひゃ") ("hyu" . "ひゅ") ("hye" . "ひぇ") ("hyo" . "ひょ") | |
("mya" . "みゃ") ("myu" . "みゅ") ("mye" . "みぇ") ("myo" . "みょ") | |
("rya" . "りゃ") ("ryu" . "りゅ") ("rye" . "りぇ") ("ryo" . "りょ") | |
("lya" . "りゃ") ("lyu" . "りゅ") ("lye" . "りぇ") ("lyo" . "りょ") | |
("gya" . "ぎゃ") ("gyu" . "ぎゅ") ("gye" . "ぎぇ") ("gyo" . "ぎょ") | |
("zya" . "じゃ") ("zyu" . "じゅ") ("zye" . "じぇ") ("zyo" . "じょ") | |
("jya" . "じゃ") ("jyu" . "じゅ") ("jye" . "じぇ") ("jyo" . "じょ") | |
( "ja" . "じゃ") ( "ju" . "じゅ") ( "je" . "じぇ") ( "jo" . "じょ") | |
("bya" . "びゃ") ("byu" . "びゅ") ("bye" . "びぇ") ("byo" . "びょ") | |
("pya" . "ぴゃ") ("pyu" . "ぴゅ") ("pye" . "ぴぇ") ("pyo" . "ぴょ") | |
("kwa" . "くゎ") ("kwi" . "くぃ") ("kwe" . "くぇ") ("kwo" . "くぉ") | |
("tsa" . "つぁ") ("tsi" . "つぃ") ("tse" . "つぇ") ("tso" . "つぉ") | |
( "fa" . "ふぁ") ( "fi" . "ふぃ") ( "fe" . "ふぇ") ( "fo" . "ふぉ") | |
("gwa" . "ぐゎ") ("gwi" . "ぐぃ") ("gwe" . "ぐぇ") ("gwo" . "ぐぉ") | |
("dyi" . "でぃ") ("dyu" . "どぅ") ("dye" . "でぇ") ("dyo" . "どぉ") | |
("dhi" . "でぃ") ("dhu" . "どぅ") ("dhe" . "でぇ") ("dho" . "どぉ") | |
("shi" . "し") ("tyi" . "てぃ") ("thi" . "てぃ") | |
("chi" . "ち") ("tsu" . "つ") ("ji" . "じ") | |
( "fu" . "ふ") ( "ye" . "いぇ") | |
("va" . "ヴぁ") ("vi" . "ヴぃ") ("vu" . "ヴ") ("ve" . "ヴぇ") ("vo" . "ヴぉ") | |
( "xa" . "ぁ") ( "xi" . "ぃ") ( "xu" . "ぅ") ( "xe" . "ぇ") ( "xo" . "ぉ") | |
( "la" . "ぁ") ( "li" . "ぃ") ( "lu" . "ぅ") ( "le" . "ぇ") ("lo" . "ぉ") | |
("xtu" . "っ") ("xya" . "ゃ") ("xyu" . "ゅ") ("xyo" . "ょ") ("xwa" . "ゎ") | |
("ltu" . "っ") ("lya" . "ゃ") ("lyu" . "ゅ") ("lyo" . "ょ") ("lwa" . "ゎ") | |
("xka" . "ヵ") ("xke" . "ヶ") | |
("1" . "1") ("2" . "2") ("3" . "3") ("4" . "4") ("5" . "5") | |
("6" . "6") ("7" . "7") ("8" . "8") ("9" . "9") ("0" . "0") | |
("!" . "!") ("@" . "@") ("#" . "#") ("$" . "$") ("%" . "%") | |
("^" . "^") ("&" . "&") ("*" . "*") ("(" . "(") (")" . ")") | |
("-" . "ー") ("=" . "=") ("`" . "`") ("\\" . "¥") ("|" . "|") | |
("_" . "_") ("+" . "+") ("~" . " ̄") ("[" . "「") ("]" . "」") | |
("{" . "{") ("}" . "}") (":" . ":") (";" . ";") ("\"" . "”") | |
("'" . "’") ("." . "。") ("," . "、") ("<" . "<") (">" . ">") | |
("?" . "?") ("/" . "/") | |
)) | |
(defvar dcj:kana-regexp | |
(regexp-opt | |
(mapcar 'cdr (sort (copy-sequence dcj:romaji-kana-table) | |
(lambda (i j) (> (length (cdr i)) | |
(length (cdr j))))))) | |
"かなを拾う正規表現。 dcj:romaji-kana-table を更新したらこの変数も更新すること。") | |
;;================================================== | |
;; 逆変換 | |
(defvar dcj:reverse-delimiter-regexp | |
(concat "[][\n\r\t " | |
",.?!()-=+|/&$#@~:;\"{}<>" | |
"、。 ※・?!()=+|&$#@:;”’" | |
"{}<>「」【】○●◎■□◇◆△▲▽▼]") | |
"逆変換の先頭を探す目印の正規表現") | |
(defvar dcj:reverse-through-delimiters | |
"、。※・?!()=+|&$#@:;”’{}<>「」【】○●◎■□◇◆△▲▽▼" | |
"直前ならやっぱり逆変換に含めたい文字列") | |
(defun dcj:reverse-search-begin () | |
"現在のポイント位置から前に走査して、逆変換を開始する位置を返す。 | |
見つからなかったら現在のポイント位置を返す。" | |
(save-excursion | |
(when (and ; 直前の文字が through-delimiters に含まれていれば無視する | |
(char-before) | |
(loop with prechar = (char-before) | |
for i across dcj:reverse-through-delimiters | |
if (eql i prechar) | |
return t finally return nil)) | |
(goto-char (1- (point)))) | |
(or | |
(and (re-search-backward dcj:reverse-delimiter-regexp nil t) | |
(1+ (point))) | |
(line-beginning-position)))) | |
(defvar dcj:reverse-translate-driver 'dcj:reverse-translate-driver-mecab "形態素解析に何を使うか。 | |
今のところ 'dcj:reverse-translate-driver-mecab / 'dcj:reverse-translate-driver-chasen が使える。 | |
ドライバーは変換済み文字列を受け取って、元の単語と読みのひらがなの alist を返す。") | |
(defun dcj:reverse-translate-driver-mecab (kanji) | |
"MeCab driver" | |
(let (result) | |
(with-temp-buffer | |
(call-process-shell-command | |
(format "echo '%s' | mecab " kanji) nil t) | |
(goto-char (point-min)) | |
(while (re-search-forward "^\\([^\t]+\\)\t\\(.*\\)$" nil t) | |
(let* ((org (match-string 1)) | |
(cols (match-string 2)) | |
(yomi (nth 7 (split-string cols ",")))) | |
(push (cons org | |
(japanese-hiragana | |
(if yomi yomi org))) | |
result)))) | |
(nreverse result))) | |
(defun dcj:reverse-translate-driver-chasen (kanji) | |
"ChaSen driver" | |
(let (result) | |
(with-temp-buffer | |
(call-process-shell-command | |
(format "echo '%s' | chasen " kanji) nil t) | |
(goto-char (point-min)) | |
(while (re-search-forward "^\\([^\t]+\\)\t\\([^\t]*\\)\t.*$" nil t) | |
(let* ((org (match-string 1)) | |
(yomi (match-string 2))) | |
(push (cons org | |
(japanese-hiragana | |
(if (or (string-match "[0-9a-zA-Z]" org) | |
(or (null yomi) (= 0 (length yomi)))) | |
org yomi))) | |
result)))) | |
(nreverse result))) | |
(defun dcj:reverse-translate-driver (kanji) | |
"漢字からひらがな・単語区切りにする。実際にはドライバーに丸投げする。" | |
(funcall dcj:reverse-translate-driver kanji)) | |
(defun dcj:reverse-kanji-to-kana (kanji) | |
"漢字からひらがなにする。" | |
(let ((words (dcj:reverse-translate-driver kanji))) | |
(when words | |
(loop for (org . kana) in words | |
concat kana)))) | |
(defun dcj:reverse-kana-to-romaji (source) | |
"アルファベットローマ字にする。" | |
(with-temp-buffer | |
(insert source) | |
(goto-char (point-min)) | |
(while (re-search-forward dcj:kana-regexp nil t) | |
(let ((match (match-string 0))) | |
(replace-match | |
(car (rassoc match dcj:romaji-kana-table))))) | |
(buffer-string))) | |
(defun dcj:reverse-translate () | |
"現在のポイント位置から逆変換を試みる" | |
(interactive) | |
(let ((poss (if (region-active-p) | |
(min (mark) (point)) | |
(dcj:reverse-search-begin))) | |
(pose (if (region-active-p) | |
(max (mark) (point)) (point))) | |
org romaji) | |
(when (< poss pose) | |
(setq org (buffer-substring poss pose) | |
romaji (dcj:reverse-kana-to-romaji | |
(dcj:reverse-kanji-to-kana org))) | |
(delete-region poss pose) | |
(setq unread-command-events | |
(append unread-command-events | |
(listify-key-sequence romaji) (list dcj:key-commit)))))) | |
(defun dcj:reverse-test () | |
"逆変換のテスト" | |
(interactive) | |
(pop-to-buffer "*dcj-kanji-to-romaji-test*") | |
(erase-buffer) | |
(loop for q in '(dcj:reverse-translate-driver-mecab) ; '(dcj:reverse-translate-driver-mecab dcj:reverse-translate-driver-chasen) | |
do | |
(let ((dcj:reverse-translate-driver q)) | |
(insert "==(" (symbol-name q) ")==========\n") | |
(loop for (in out) in | |
'(("太郎は花子が好きだ" "tarouhahanakogasukida") ; 普通 | |
("今日は良い天気ですね" "kyouhayoiten'kidesune") ; ん | |
("逆変換を試みる" "gyakuhen'kan'wokokoromiru") ; ん | |
("東京特許許可局" "toukyoutoxtukyokyokakyoku") ; 小さいつ | |
("1太郎は花子が好きだ" "1tarouhahanakogasukida") ; 数字込み | |
("デフォルトのカーソールの色" "deforutonoka-so-runoiro") ; 未知語 | |
("AはBを含む。" "AhaBwohukumu.") ; アルファベット込み | |
("MySQLにJSONを登録する。" "MySQLniJSONwotourokusuru.")) ; 英単語込み | |
for kana = (dcj:reverse-kanji-to-kana in) | |
for romaji = (dcj:reverse-kana-to-romaji kana) | |
do | |
(insert (format "%s : %s / %s / %s <= %s\n" | |
(if (equal romaji out) "OK" "NG") | |
out romaji kana in)))))) | |
;; (progn (eval-current-buffer) (dcj:reverse-test)) | |
(provide 'decompjp) | |
;;; decompjp.el ends here |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment