Created
August 1, 2011 03:53
-
-
Save tily/1117548 to your computer and use it in GitHub Desktop.
日本語テキストから脚韻を抽出するスクリプト (まとめ)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Usage: ruby extract_rhyme.rb [-m (vowel|vowel_with_help|surface)] -n N /path/to/file.txt
# Example: ruby extract_rhyme.rb -m vowel -n 3 坊ちゃん.txt
require 'optparse'
require 'MeCab'
# Entry point: reads the text file named by the first non-option argument,
# tokenizes every line with get_node_list, and prints groups of words whose
# last N sounds are identical.
#
# args - command-line arguments (options followed by the input path), in the
#        form accepted by parse_args.
#
# Each printed line has the form "<tail>: <word> | <word> | ...". Only groups
# containing more than two distinct surface forms are printed, in sorted key
# order.
#
# Raises ArgumentError when the input path is missing or when -n was not
# given as a positive integer.
def main(args)
  opts, args = parse_args(args)
  path = args[0]
  num = opts[:num]
  raise ArgumentError, 'missing input file path' if path.nil?
  raise ArgumentError, '-n NUM is required and must be a positive integer' unless num.is_a?(Integer) && num > 0

  # Maps a rhyme tail (the last +num+ sounds joined) to the surface forms
  # ending with it, in order of first appearance.
  rhyme = Hash.new { |hash, key| hash[key] = [] }

  # File.foreach closes the file once iteration finishes.
  File.foreach(path) do |line|
    get_node_list(line).each do |node|
      surface = node[:surface]
      # The reading is taken from the last comma-separated feature field.
      # When that field is '*', fall back to the surface form if it is
      # already written purely in kana.
      yomi = node[:feature].split(',').last
      yomi = surface if yomi == '*' && JaSound.only_kana?(surface)

      sound_list = JaSound.split(yomi)
      case opts[:mode]
      when 'vowel'
        sound_list = sound_list.map { |s| JaSound.to_vowel(s) }
      when 'vowel_with_help'
        sound_list = sound_list.map { |s| JaSound.to_vowel_with_help(s) }
      end
      # Any other mode leaves the sounds as they are.

      next if sound_list.size < num

      tail = sound_list.last(num).join
      rhyme[tail] << surface unless rhyme[tail].include?(surface)
    end
  end

  rhyme.keys.sort.each do |key|
    words = rhyme[key]
    puts "#{key}: #{words.join(' | ')}" if words.size > 2
  end
end
# Parses the command-line options out of +args+.
#
# args - Array of command-line argument strings. Recognized options are
#        removed from it in place (OptionParser#parse!), leaving only the
#        positional arguments.
#
# Recognized options:
#   -m MODE  one of vowel, vowel_with_help, surface (defaults to surface)
#   -n NUM   integer number of trailing sounds to compare
#
# Returns a two-element Array [opts, args]: the options Hash (keys :mode and,
# when -n was given, :num) and the remaining positional arguments.
#
# Raises OptionParser::InvalidArgument when MODE is not a known mode or NUM
# is not an integer.
def parse_args(args)
  modes = %w[vowel vowel_with_help surface]
  # An explicit default documents what an omitted -m means.
  opts = { :mode => 'surface' }

  parser = OptionParser.new do |opt|
    opt.banner = "Usage: #{File.basename($PROGRAM_NAME)} [-m (#{modes.join('|')})] -n N /path/to/file.txt"
    # Passing the list of valid modes makes OptionParser reject anything
    # else instead of letting a typo silently select surface mode.
    opt.on('-m MODE', modes, "matching mode (#{modes.join(', ')})") { |v| opts[:mode] = v }
    opt.on('-n NUM', Integer, 'number of trailing sounds to compare') { |v| opts[:num] = v }
  end
  parser.parse!(args)

  [opts, args]
end
# Runs MeCab over +text+ and returns the resulting nodes as plain hashes.
#
# text   - String to analyze.
# tagger - Object responding to parseToNode(text); defaults to a fresh
#          MeCab::Tagger. Callers that analyze many strings may pass one
#          tagger in and reuse it.
#
# Returns an Array of Hashes with keys :surface and :feature, one per node
# after the first. The first node returned by parseToNode is skipped and
# every following node up to the end of the chain is included.
# NOTE(review): the skipped node is presumably MeCab's beginning-of-sentence
# sentinel, and the last collected node its end-of-sentence sentinel — confirm
# against the MeCab documentation.
def get_node_list(text, tagger = MeCab::Tagger.new)
  list = []
  node = tagger.parseToNode(text)
  # Defensive: return an empty list rather than failing if no node chain
  # came back.
  return list if node.nil?

  # Advance before collecting so the head node is never included.
  while (node = node.next)
    list << { :surface => node.surface, :feature => node.feature }
  end
  list
end
# Helpers for splitting a katakana string into sounds and reducing each sound
# to the vowel it carries.
#
# A "sound" is one full-size katakana followed by any number of small kana
# and helper marks (ッ, ー, ン).
class JaSound
  # Full-size katakana grouped by vowel: each key is the list of characters,
  # each value the vowel they reduce to.
  LARGE_MAP = {
    %w|ア カ サ タ ナ ハ マ ヤ ラ ワ ガ ザ ダ バ パ| => 'ア',
    %w|イ キ シ チ ニ ヒ ミ リ ヰ ギ ジ ヂ ビ ピ| => 'イ',
    %w|ウ ク ス ツ ヌ フ ム ユ ル ヴ グ ズ ヅ ブ プ| => 'ウ',
    %w|エ ケ セ テ ネ ヘ メ レ ヱ ゲ ゼ デ ベ ペ| => 'エ',
    %w|オ コ ソ ト ノ ホ モ ヨ ロ ヲ ゴ ゾ ド ボ ポ| => 'オ'
  }.freeze

  # Small katakana grouped by vowel. When a sound contains one, it decides
  # the vowel instead of the leading full-size character.
  SMALL_MAP = {
    %w|ァ ャ ヮ| => 'ア',
    %w|ィ| => 'イ',
    %w|ゥ ュ| => 'ウ',
    %w|ェ| => 'エ',
    %w|ォ ョ| => 'オ'
  }.freeze

  LARGE_STR = LARGE_MAP.keys.flatten.join('').freeze
  SMALL_STR = SMALL_MAP.keys.flatten.join('').freeze
  # Marks that extend a sound without contributing a vowel of their own.
  HELP_STR = %w|ッ ー ン|.join('').freeze

  # Flat character => vowel table built once from both maps, so a lookup
  # does not have to scan the grouped lists on every call.
  VOWEL_OF = LARGE_MAP.merge(SMALL_MAP).each_with_object({}) do |(chars, vowel), table|
    chars.each { |char| table[char] = vowel }
  end.freeze

  # The regexps are compiled once here rather than on each method call.
  SOUND_RE = /[#{LARGE_STR}][#{SMALL_STR}#{HELP_STR}]*/u
  # \A and \z anchor the whole string; ^ and $ would match per line and
  # accept text that merely contains one all-kana line.
  KANA_ONLY_RE = /\A[#{LARGE_STR}#{SMALL_STR}#{HELP_STR}]+\z/u
  SOUND_HEAD_RE = /([#{LARGE_STR}])([#{SMALL_STR}]?)/u

  # Splits +text+ into its sounds.
  #
  # Returns an Array of Strings; characters that are not part of any sound
  # are dropped.
  def self.split(text)
    text.scan(SOUND_RE)
  end

  # Returns true when +text+ is non-empty and consists solely of the
  # katakana, small kana and helper marks known to this class; false
  # otherwise.
  def self.only_kana?(text)
    !(text =~ KANA_ONLY_RE).nil?
  end

  # Reduces +sound+ to its vowel alone, dropping any helper marks.
  def self.to_vowel(sound)
    to_vowel_with_help(sound).delete(HELP_STR)
  end

  # Replaces the leading kana of +sound+ (and the small kana directly after
  # it, if any) with the vowel it carries, keeping whatever follows.
  def self.to_vowel_with_help(sound)
    sound.sub(SOUND_HEAD_RE) do
      large = Regexp.last_match(1)
      small = Regexp.last_match(2)
      # A small kana, when present, determines the vowel.
      VOWEL_OF[small.empty? ? large : small]
    end
  end
end
# Run only when executed as a script, so the file can also be required
# without triggering the command-line entry point.
main(ARGV) if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment