Created
August 13, 2015 22:34
-
-
Save takehiko/816f735182780b690f94 to your computer and use it in GitHub Desktop.
Romanization Rule into Trie
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# kanatrie.rb : build tries of romaji spellings (supports Hepburn, Kunrei-shiki and Nihon-shiki)
# by takehikom
# required: KAKASI, Graphviz
# Builds romaji <-> kana lookup tables by running KAKASI over a fixed kana list.
#
# The kana list (the single kana in str1 plus every str2 kana followed by
# ゃ / ゅ / ょ) is written to "kana.txt" in the current directory and converted
# by the external `kakasi` command into "romaji.txt". Both scratch files are
# removed afterwards, even when the conversion fails.
#
# option keys read by this method:
#   :label  - when truthy, a "==== <label> ====" banner is printed first
#   :kunrei - when truthy, kakasi is run with -rkunrei instead of -r
#   :di     - when falsy together with a truthy :kunrei, ぢ / づ are forced
#             to "zi" / "zu"
#   :wo     - when falsy, を is forced to "o"
# (Callers in this file also pass :ji; this method does not read it.)
#
# Returns [r2c_h, c2r_h]:
#   r2c_h - romaji => kana; kana sharing one spelling are joined with ","
#   c2r_h - kana => romaji
#
# Raises RuntimeError when the kakasi command fails or when its output does
# not contain exactly one line per kana.
def create_romaji_kana_hash(option = {})
  puts "==== #{option[:label]} ====" if option[:label]

  str1 = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをんがぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽ"
  str2 = "きしちにひみりぎじぢびぴ"
  kana_a = str1.split(//) +
           str2.split(//).map { |c| [c + "ゃ", c + "ゅ", c + "ょ"] }.flatten

  romanization = option[:kunrei] ? "-rkunrei" : "-r"
  command = "kakasi -Ha #{romanization} -iutf8 <kana.txt >romaji.txt"

  begin
    File.open("kana.txt", "w") { |f_out| f_out.puts kana_a }
    puts command
    # system returns false/nil when kakasi is missing or exits non-zero;
    # continuing would only produce nil romaji further down.
    raise "command failed: #{command}" unless system(command)
    romaji_a = File.read("romaji.txt", :encoding => "UTF-8").split(/\n/)
  ensure
    # The shell redirect creates romaji.txt even when kakasi itself fails.
    ["kana.txt", "romaji.txt"].each do |path|
      File.unlink(path) if File.exist?(path)
    end
  end

  if romaji_a.length != kana_a.length
    raise "kakasi returned #{romaji_a.length} lines for #{kana_a.length} kana"
  end

  r2c_h = {}
  c2r_h = {}
  kana_a.zip(romaji_a) do |c1, c2|
    # Without :di, Kunrei spells ぢ / づ the same way as じ / ず.
    if option[:kunrei] && !option[:di]
      c2 = "zi" if c1 == "ぢ"
      c2 = "zu" if c1 == "づ"
    end
    c2 = "o" if !option[:wo] && c1 == "を"

    puts "#{c2} => #{c1}"
    if r2c_h.key?(c2)
      # Several kana share this spelling: keep them all, comma-separated.
      r2c_h[c2] = "#{r2c_h[c2]},#{c1}"
      puts "(#{c2} => #{r2c_h[c2]})"
    else
      r2c_h[c2] = c1
    end
    c2r_h[c1] = c2
  end

  [r2c_h, c2r_h]
end
# Lookup tables per romanization system. Each value is the
# [romaji => kana, kana => romaji] pair returned by create_romaji_kana_hash.
rk_h = {
  :hepburn => create_romaji_kana_hash(:ji => true, :label => "hepburn"),
  :kunrei => create_romaji_kana_hash(:kunrei => true, :label => "kunrei"),
  :nippon => create_romaji_kana_hash(:kunrei => true, :di => true, :wo => true, :label => "nippon")
}
# exit
# Writes "<basename>.dot", a Graphviz digraph of the trie formed by the
# romaji spellings in c2r_h, then runs `dot` to render "<basename>.png".
#
# c2r_h    - Hash of kana => romaji spelling
# basename - output file stem; also used as the digraph name
# label    - caption placed on the graph
#
# Every prefix of a spelling becomes a node reached from the "empty" root,
# and the full spelling is linked by a dotted, arrowless edge to a box
# labelled with its kana. Kana are ordered by the class of their first
# character (three hard-coded groups, anything else last) and then by the
# kana string itself.
#
# Returns what Kernel#system returned for the `dot` invocation; a failed or
# missing `dot` is reported on stderr but the .dot file is still kept.
# Raises ArgumentError when a kana has a nil or empty romaji, since that
# cannot be expressed as a path in the trie.
def generate_trie(c2r_h, basename, label)
  puts "start: #{label}"

  c2r_h.each do |c, rom|
    raise ArgumentError, "no romaji for #{c.inspect}" if rom.nil? || rom.empty?
  end

  c_keys = c2r_h.keys.sort_by do |key|
    group = case key[0]
            when /[あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん]/
              "1"
            when /[がぎぐげござじずぜぞだぢづでどばびぶべぼ]/
              "2"
            when /[ぱぴぷぺぽ]/
              "3"
            else
              "9"
            end
    group + key
  end

  node_a = ["empty [label = \"\"];"]
  path_a = []
  romend_a = []
  c_keys.each do |c|
    rom = c2r_h[c]
    # "ka" => ["k", "ka"]: one trie node per prefix.
    romseq_a = (1..rom.length).map { |i| rom[0, i] }
    # Hex of the kana's bytes gives an ASCII-only node id for the leaf.
    romend = "x_" + c.unpack("H*").first
    romend_a << romend
    node_a << "#{romend} [shape = box, label = \"#{c}\"];"
    path_a << "empty -> #{romseq_a.first};"
    romseq_a.each_cons(2) { |from, to| path_a << "#{from} -> #{to};" }
    path_a << "#{rom} -> #{romend} [dir = none, style = dotted];"
  end
  # Spellings share prefixes, so the same edge is generated repeatedly.
  node_a.uniq!
  path_a.uniq!

  dot_path = "#{basename}.dot"
  png_path = "#{basename}.png"
  File.open(dot_path, "w") do |f_out|
    f_out.puts "digraph #{basename} {"
    f_out.puts " graph [rankdir = LR, label = \"#{label}\"];"
    f_out.puts
    f_out.puts node_a.map { |item| " " + item }
    f_out.puts
    f_out.puts path_a.map { |item| " " + item }
    f_out.puts
    f_out.puts " {rank = same; #{romend_a.join('; ')}}"
    f_out.puts "}"
  end

  puts "dot -Tpng #{dot_path} -o #{png_path}"
  # Argument-list form bypasses the shell, so basename is passed verbatim.
  result = system("dot", "-Tpng", dot_path, "-o", png_path)
  warn "dot failed to render #{png_path}" unless result
  result
end
# Render one trie per romanization system: Hepburn, Kunrei-shiki, Nihon-shiki.
# Index 1 of each rk_h entry is the kana => romaji table.
[
  [:hepburn, "kanatrie_h", "ヘボン式ローマ字のトライ木"],
  [:kunrei, "kanatrie_k", "訓令式ローマ字のトライ木"],
  [:nippon, "kanatrie_n", "日本式ローマ字のトライ木"]
].each do |system_key, basename, label|
  generate_trie(rk_h[system_key][1], basename, label)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment