Skip to content

Instantly share code, notes, and snippets.

@etscrivner
Created March 25, 2010 06:31
Show Gist options
  • Save etscrivner/343257 to your computer and use it in GitHub Desktop.
Save etscrivner/343257 to your computer and use it in GitHub Desktop.
# Scrape Google News
#
# Grab the top stories from Google News and print them.
require 'rubygems'
require 'hpricot'
require 'open-uri'
# Scrapes the "top stories" section of a Google News page.
#
# The page is downloaded and parsed once, when the object is constructed;
# #top_stories then only queries the already-parsed document.
class GoogleNews
  # Page fetched when no URL is given to the constructor.
  DEFAULT_URL = "http://news.google.com".freeze

  # Selectors (Hpricot CSS/XPath mix) for the three parts of a story.
  TITLE_SELECTOR   = "div[@id='top-stories']//h2.title/a".freeze
  SOURCE_SELECTOR  = "div[@id='top-stories']//div[@class='sub-title']/span.source".freeze
  SNIPPET_SELECTOR = "div[@id='top-stories']//div.body/div.snippet".freeze

  # @param url [String] address of the page to scrape
  # @raise whatever open-uri raises when the page cannot be fetched
  def initialize(url = DEFAULT_URL)
    @url = url
    @hp = fetch_document
  end

  # Collects the top stories found in the parsed page.
  #
  # Titles, sources and snippets are matched up purely by position. A story
  # is produced for every non-empty title; 'source' and 'desc' are only set
  # when an element exists at the same position, and surplus sources or
  # snippets are ignored rather than raising on a missing story.
  #
  # @return [Array<Hash>] hashes with the string keys 'title' and, when
  #   available, 'source' and 'desc'
  def top_stories
    # Links with no text at all are dropped before stripping.
    titles = (@hp / TITLE_SELECTOR).map { |link| link.inner_text }
    titles = titles.reject { |text| text.empty? }.map { |text| text.strip }
    sources = stripped_texts(SOURCE_SELECTOR)
    snippets = stripped_texts(SNIPPET_SELECTOR)

    titles.each_with_index.map do |title, index|
      story = { 'title' => title }
      story['source'] = sources[index] if index < sources.size
      story['desc'] = snippets[index] if index < snippets.size
      story
    end
  end

  private

  # Downloads @url and returns the parsed document, closing the stream
  # once parsing is done.
  def fetch_document
    if URI.respond_to?(:open)
      # Kernel#open no longer accepts URLs on Ruby 3.0+.
      URI.open(@url) { |io| Hpricot(io) }
    else
      # Older Rubies only offer open-uri's Kernel#open patch.
      open(@url) { |io| Hpricot(io) }
    end
  end

  # Returns the whitespace-stripped text of every element matching selector.
  def stripped_texts(selector)
    (@hp / selector).map { |node| node.inner_text.strip }
  end
end
# Fetch the current top stories and print each one as a numbered entry:
# title, then source, then description.
GoogleNews.new.top_stories.each_with_index do |story, index|
  number = index + 1 # entries are numbered from 1
  puts "**** #{number}: ****\n#{story['title']}\n"
  print "Source: #{story['source']}\n\n"
  print "#{story['desc']}\n\n"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment