Created
June 13, 2009 01:55
-
-
Save soh335/129059 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'uri'
require 'rubygems'
require 'MeCab'
require 'nokogiri'
url = 'http://www.asahi.com/' | |
text = String.new | |
nokogiri = Nokogiri::HTML.parse(open(url)) | |
li = nokogiri.xpath('//div[@id="HeadLine"]/ul[@class="Lnk FstMod"]/li[1]/a') | |
nokogiri = Nokogiri::HTML.parse(open(url + li[0].attribute('href'))) | |
nokogiri.xpath('//div[@class="BodyTxt"]/*').each do |body| | |
text = text + body.text | |
end | |
text.gsub!(/\n/,'') | |
mecab = MeCab::Tagger.new("-Owakati") | |
data = Array.new | |
mecab.parse(text + "EOS").split(" ").each_cons(3) do |a| | |
data.push h = {'head' => a[0], 'middle' => a[1], 'end' => a[2]} | |
end | |
t1 = data[0]['head'] | |
t2 = data[0]['middle'] | |
new_text = t1 + t2 | |
while true | |
_a = Array.new | |
data.each do |hash| | |
_a.push hash if hash['head'] == t1 && hash['middle'] == t2 | |
end | |
break if _a.size == 0 | |
num = rand(_a.size) | |
new_text = new_text + _a[num]['end'] | |
break if _a[num]['end'] == "EOS" | |
t1 = _a[num]['middle'] | |
t2 = _a[num]['end'] | |
end | |
puts new_text.gsub!(/EOS$/,'') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment