tily · August 1, 2011 03:53
diff --git a/extract_rhyme.rb b/extract_rhyme.rb
 #!/usr/bin/env ruby

 # Usage: ruby extract_rhyme.rb [-m (vowel|vowel_with_help|surface)] -n N /path/to/file.txt
 # Example: ruby extract_rhyme.rb -m vowel -n 3 坊ちゃん.txt

 %w|optparse MeCab|.each{|x| require x}

 # Entry point: prints groups of words from a text file whose readings end in
 # the same sounds.
 #
 # args - command-line arguments: the options understood by parse_args,
 #        followed by the path of the text file to read.
 #
 # Each line of the file is turned into nodes by get_node_list. A node's
 # reading is the last comma-separated field of its feature string; when that
 # field is '*' and the surface passes JaSound.only_kana?, the surface itself
 # is used as the reading. The reading is split into sounds, optionally reduced
 # to vowels depending on opts[:mode], and the last opts[:num] sounds form the
 # key under which the surface is collected (each surface once per key).
 #
 # Keys are printed in sorted order, and only when more than two surfaces
 # share them.
 #
 # Raises ArgumentError when -n was not given or is not a positive integer.
 def main(args)
 	opts, args = parse_args(args)

 	# Without this check a missing -n only fails deep inside the loop with
 	# "comparison of Integer with nil failed", and a negative value ends in a
 	# NoMethodError on nil.
 	num = opts[:num]
 	unless num.is_a?(Integer) && num > 0
 		raise ArgumentError, '-n NUM is required and must be a positive integer'
 	end

 	rhyme = {}

 	# The block form of File.open closes the file even if processing raises.
 	File.open(args[0]) do |file|
 		file.each_line do |line|
 			get_node_list(line).each do |node|
 				surface = node[:surface]
 				yomi = node[:feature].split(',').last
 				yomi = surface if yomi == '*' && JaSound.only_kana?(surface)

 				sound_list = JaSound.split(yomi)
 				case opts[:mode]
 				when 'vowel'
 					sound_list = sound_list.map { |s| JaSound.to_vowel(s) }
 				when 'vowel_with_help'
 					sound_list = sound_list.map { |s| JaSound.to_vowel_with_help(s) }
 				end

 				# Words shorter than the requested tail cannot rhyme on it.
 				next if sound_list.size < num

 				tail = sound_list.last(num).join('')
 				words = (rhyme[tail] ||= [])
 				words.push(surface) unless words.include?(surface)
 			end
 		end
 	end

 	rhyme.keys.sort.each do |k|
 		puts "#{k}: #{rhyme[k].join(' | ')}" if rhyme[k].size > 2
 	end
 end

 # Parses the command-line options out of args.
 #
 # args - array of command-line arguments; recognised options are removed
 #        from it in place by OptionParser#parse!.
 #
 # "-m MODE" is stored as a String under :mode and "-n NUM" as an Integer
 # under :num. Options that were not supplied are simply absent from the hash.
 #
 # Returns a two-element array: the options hash and the (mutated) args.
 def parse_args(args)
 	parsed = {}
 	parser = OptionParser.new
 	parser.on("-m MODE", String ) { |mode|  parsed[:mode] = mode }
 	parser.on("-n NUM" , Integer) { |count| parsed[:num]  = count }
 	parser.parse!(args)
 	[parsed, args]
 end

 # Runs MeCab over text and returns its nodes as plain hashes.
 #
 # text - the string to analyse.
 #
 # A new MeCab::Tagger is created on every call. The node returned by
 # parseToNode itself is skipped; every node reachable from it through #next
 # is recorded.
 #
 # Returns an array of {:surface => ..., :feature => ...} hashes.
 def get_node_list(text)
 	nodes = []
 	cursor = MeCab::Tagger.new.parseToNode(text).next
 	while cursor
 		nodes.push(:surface => cursor.surface, :feature => cursor.feature)
 		cursor = cursor.next
 	end
 	nodes
 end

 # Helpers for breaking a katakana reading into sounds and reducing each sound
 # to its vowel.
 #
 # A "sound" is one full-size katakana followed by any run of small katakana
 # and help characters (ッ, ー, ン), as described by SOUND_PATTERN.
 class JaSound
 	# Full-size katakana, grouped by the vowel each group is reduced to.
 	LARGE_MAP = {
 		%w|ア カ サ タ ナ ハ マ ヤ ラ ワ ガ ザ ダ バ パ| => 'ア',
 		%w|イ キ シ チ ニ ヒ ミ    リ ヰ ギ ジ ヂ ビ ピ| => 'イ',
 		%w|ウ ク ス ツ ヌ フ ム ユ ル ヴ グ ズ ヅ ブ プ| => 'ウ',
 		%w|エ ケ セ テ ネ ヘ メ    レ ヱ ゲ ゼ デ ベ ペ| => 'エ',
 		%w|オ コ ソ ト ノ ホ モ ヨ ロ ヲ ゴ ゾ ド ボ ポ| => 'オ'
 	}
 	# Small katakana, grouped the same way. When a sound contains one, it
 	# decides the vowel instead of the full-size kana before it.
 	SMALL_MAP = {
 		%w|ァ ャ ヮ| => 'ア',
 		%w|ィ      | => 'イ',
 		%w|ゥ ュ   | => 'ウ',
 		%w|ェ      | => 'エ',
 		%w|ォ ョ   | => 'オ'
 	}
 	LARGE_STR = LARGE_MAP.keys.flatten.join('')
 	SMALL_STR = SMALL_MAP.keys.flatten.join('')
 	# Characters kept by to_vowel_with_help and stripped by to_vowel.
 	HELP_STR = %w|ッ ー ン|.join('')

 	# The patterns are compiled once here instead of being re-interpolated on
 	# every method call.
 	SOUND_PATTERN = /[#{LARGE_STR}][#{SMALL_STR}#{HELP_STR}]*/u
 	# \A and \z anchor the whole string; ^ and $ would accept multi-line text
 	# as soon as any single line consisted of kana.
 	KANA_ONLY_PATTERN = /\A[#{LARGE_STR}#{SMALL_STR}#{HELP_STR}]+\z/u
 	HELP_PATTERN = /[#{HELP_STR}]/u
 	HEAD_PATTERN = /([#{LARGE_STR}])([#{SMALL_STR}]?)/u

 	# Splits text into sounds. Characters that do not fit SOUND_PATTERN are
 	# dropped.
 	#
 	# Returns an array of strings.
 	def self.split(text)
 		text.scan(SOUND_PATTERN)
 	end

 	# Tells whether text is non-empty and made up solely of the katakana and
 	# help characters known to this class.
 	#
 	# Returns a MatchData (truthy) on success, nil otherwise.
 	def self.only_kana?(text)
 		text.match(KANA_ONLY_PATTERN)
 	end

 	# Reduces a sound to its bare vowel: like to_vowel_with_help, but with
 	# every help character removed.
 	def self.to_vowel(sound)
 		to_vowel_with_help(sound).gsub(HELP_PATTERN, '')
 	end

 	# Replaces the leading kana of sound (a full-size kana plus an optional
 	# small kana) with its vowel, leaving the rest of the string untouched.
 	def self.to_vowel_with_help(sound)
 		sound.sub(HEAD_PATTERN) do
 			large, small = $1, $2
 			if small == ''
 				vowel_of(LARGE_MAP, large)
 			else
 				vowel_of(SMALL_MAP, small)
 			end
 		end
 	end

 	# Looks up the vowel whose kana group in map contains kana.
 	def self.vowel_of(map, kana)
 		_group, vowel = map.find { |kana_list, _| kana_list.include?(kana) }
 		vowel
 	end
 	private_class_method :vowel_of
 end

 # Run the extractor only when this file is executed directly, so that loading
 # it from another script (for example to reuse JaSound) does not start a run.
 main(ARGV) if __FILE__ == $0
	#!/usr/bin/env ruby

	# Usage: ruby extract_rhyme.rb [-m (vowel|vowel_with_help|surface)] -n N /path/to/file.txt
	# Example: ruby extract_rhyme.rb -m vowel -n 3 坊ちゃん.txt

	# Load the option parser and the MeCab binding. The escaped pipes left over
	# from markup are removed so that the line is valid Ruby again.
	%w|optparse MeCab|.each { |x| require x }

	# Entry point: prints groups of words from a text file whose readings end in
	# the same sounds.
	#
	# args - command-line arguments: the options understood by parse_args,
	#        followed by the path of the text file to read.
	#
	# Each line of the file is turned into nodes by get_node_list. A node's
	# reading is the last comma-separated field of its feature string; when that
	# field is '*' and the surface passes JaSound.only_kana?, the surface itself
	# is used as the reading. The reading is split into sounds, optionally
	# reduced to vowels depending on opts[:mode], and the last opts[:num] sounds
	# form the key under which the surface is collected (each surface once per
	# key).
	#
	# Keys are printed in sorted order, and only when more than two surfaces
	# share them.
	#
	# Raises ArgumentError when -n was not given or is not a positive integer.
	def main(args)
		opts, args = parse_args(args)

		# Without this check a missing -n only fails deep inside the loop with
		# "comparison of Integer with nil failed", and a negative value ends in
		# a NoMethodError on nil.
		num = opts[:num]
		unless num.is_a?(Integer) && num > 0
			raise ArgumentError, '-n NUM is required and must be a positive integer'
		end

		rhyme = {}

		# The block form of File.open closes the file even if processing raises.
		File.open(args[0]) do |file|
			file.each_line do |line|
				get_node_list(line).each do |node|
					surface = node[:surface]
					yomi = node[:feature].split(',').last
					yomi = surface if yomi == '*' && JaSound.only_kana?(surface)

					sound_list = JaSound.split(yomi)
					case opts[:mode]
					when 'vowel'
						sound_list = sound_list.map { |s| JaSound.to_vowel(s) }
					when 'vowel_with_help'
						sound_list = sound_list.map { |s| JaSound.to_vowel_with_help(s) }
					end

					# Words shorter than the requested tail cannot rhyme on it.
					next if sound_list.size < num

					tail = sound_list.last(num).join('')
					words = (rhyme[tail] ||= [])
					words.push(surface) unless words.include?(surface)
				end
			end
		end

		rhyme.keys.sort.each do |k|
			puts "#{k}: #{rhyme[k].join(' | ')}" if rhyme[k].size > 2
		end
	end

	# Parses the command-line options out of args.
	#
	# args - array of command-line arguments; recognised options are removed
	#        from it in place by OptionParser#parse!.
	#
	# "-m MODE" is stored as a String under :mode and "-n NUM" as an Integer
	# under :num. Options that were not supplied are absent from the hash.
	#
	# Returns a two-element array: the options hash and the (mutated) args.
	def parse_args(args)
		opts = {}
		# Block parameters use plain pipes; the backslash-escaped pipes left
		# over from markup were not valid Ruby.
		OptionParser.new do |opt|
			opt.on("-m MODE", String ) { |v| opts[:mode] = v }
			opt.on("-n NUM" , Integer) { |v| opts[:num] = v }
			opt.parse!(args)
		end
		[opts, args]
	end

	# Analyses text with a freshly created MeCab::Tagger and collects the
	# resulting nodes.
	#
	# text - the string to analyse.
	#
	# The node handed back by parseToNode is not recorded; collection starts
	# with its successor and follows #next until it yields nothing.
	#
	# Returns an array of hashes with the keys :surface and :feature.
	def get_node_list(text)
		collected = []
		current = MeCab::Tagger.new.parseToNode(text)
		# Modifier form: the condition advances the cursor before each append.
		collected << {:surface => current.surface, :feature => current.feature} while (current = current.next)
		collected
	end

	# Helpers for breaking a katakana reading into sounds and reducing each
	# sound to its vowel.
	#
	# A "sound" is one full-size katakana followed by any run of small katakana
	# and help characters (ッ, ー, ン), as described by SOUND_PATTERN.
	class JaSound
		# Full-size katakana, grouped by the vowel each group is reduced to.
		# Every kana is its own list element, because lookups test single
		# characters with Array#include?.
		LARGE_MAP = {
			%w|ア カ サ タ ナ ハ マ ヤ ラ ワ ガ ザ ダ バ パ| => 'ア',
			%w|イ キ シ チ ニ ヒ ミ リ ヰ ギ ジ ヂ ビ ピ| => 'イ',
			%w|ウ ク ス ツ ヌ フ ム ユ ル ヴ グ ズ ヅ ブ プ| => 'ウ',
			%w|エ ケ セ テ ネ ヘ メ レ ヱ ゲ ゼ デ ベ ペ| => 'エ',
			%w|オ コ ソ ト ノ ホ モ ヨ ロ ヲ ゴ ゾ ド ボ ポ| => 'オ'
		}
		# Small katakana, grouped the same way. When a sound contains one, it
		# decides the vowel instead of the full-size kana before it.
		SMALL_MAP = {
			%w|ァ ャ ヮ| => 'ア',
			%w|ィ| => 'イ',
			%w|ゥ ュ| => 'ウ',
			%w|ェ| => 'エ',
			%w|ォ ョ| => 'オ'
		}
		LARGE_STR = LARGE_MAP.keys.flatten.join('')
		SMALL_STR = SMALL_MAP.keys.flatten.join('')
		# Characters kept by to_vowel_with_help and stripped by to_vowel.
		HELP_STR = %w|ッ ー ン|.join('')

		# The patterns are compiled once here instead of being re-interpolated
		# on every method call.
		SOUND_PATTERN = /[#{LARGE_STR}][#{SMALL_STR}#{HELP_STR}]*/u
		# \A and \z anchor the whole string; ^ and $ would accept multi-line
		# text as soon as any single line consisted of kana.
		KANA_ONLY_PATTERN = /\A[#{LARGE_STR}#{SMALL_STR}#{HELP_STR}]+\z/u
		HELP_PATTERN = /[#{HELP_STR}]/u
		HEAD_PATTERN = /([#{LARGE_STR}])([#{SMALL_STR}]?)/u

		# Splits text into sounds. Characters that do not fit SOUND_PATTERN are
		# dropped.
		#
		# Returns an array of strings.
		def self.split(text)
			text.scan(SOUND_PATTERN)
		end

		# Tells whether text is non-empty and made up solely of the katakana
		# and help characters known to this class.
		#
		# Returns a MatchData (truthy) on success, nil otherwise.
		def self.only_kana?(text)
			text.match(KANA_ONLY_PATTERN)
		end

		# Reduces a sound to its bare vowel: like to_vowel_with_help, but with
		# every help character removed.
		def self.to_vowel(sound)
			to_vowel_with_help(sound).gsub(HELP_PATTERN, '')
		end

		# Replaces the leading kana of sound (a full-size kana plus an optional
		# small kana) with its vowel, leaving the rest of the string untouched.
		def self.to_vowel_with_help(sound)
			sound.sub(HEAD_PATTERN) do
				large, small = $1, $2
				if small == ''
					vowel_of(LARGE_MAP, large)
				else
					vowel_of(SMALL_MAP, small)
				end
			end
		end

		# Looks up the vowel whose kana group in map contains kana.
		def self.vowel_of(map, kana)
			_group, vowel = map.find { |kana_list, _| kana_list.include?(kana) }
			vowel
		end
		private_class_method :vowel_of
	end

	# Run the extractor only when this file is executed directly, so that
	# loading it from another script does not start a run.
	main(ARGV) if __FILE__ == $0