Last active
August 29, 2015 13:56
-
-
Save amardaxini/8971653 to your computer and use it in GitHub Desktop.
CNN news story parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems' | |
require 'net/http' | |
require 'nokogiri' | |
require 'pry' | |
require 'open-uri' | |
module CnnScrapper | |
# Scrapes the CNN front page (URL) and collects links to news stories.
#
# Typical use: call #parse_it to fill #news_urls, then #get_news_stories to
# fetch and parse every collected link into #news_stories.
class CnnParser
  # Front-page address; also prefixed onto the host-less story links.
  URL = "http://edition.cnn.com"

  # cnn_doc      - parsed front-page document, set by #parse_it
  # news_stories - NewsStory objects appended by #get_news_stories
  # news_urls    - hashes shaped {:url => String, :short_title => String}
  attr_accessor :cnn_doc,:news_stories,:news_urls

  def initialize
    @news_stories = []
    @news_urls = []
  end

  # Downloads and parses the front page, then collects the story links.
  #
  # Returns whatever #get_news_urls returns.
  def parse_it
    # URI.open instead of Kernel#open: open-uri no longer patches Kernel#open
    # for URLs on Ruby 3.0+. The block form closes the handle once parsed.
    @cnn_doc = URI.open(URL) { |page| Nokogiri::HTML(page) }
    get_news_urls
  end

  # Appends one {:url, :short_title} hash to @news_urls for every list item
  # whose first link has no host of its own and does not start with "/video".
  #
  # Requires @cnn_doc to be set (see #parse_it).
  # Returns the collection of list items that was iterated.
  def get_news_urls
    news_lists = @cnn_doc.css("#cnn_maintt2bul").search('div[data-vr-zone="t3"] ul li')
    news_lists.each do |news_list|
      anchor = news_list.search("a[@href]").first
      # A list item without a link used to raise NoMethodError on nil here.
      next unless anchor

      news_url = anchor["href"].to_s
      next unless site_relative?(news_url)
      next if news_url.start_with?("/video")

      @news_urls << {:url=>URL+news_url,:short_title=> anchor.text}
    end
  end

  # Builds a NewsStory for every entry in @news_urls, parses it and appends it
  # to @news_stories. Each story triggers its own page download.
  #
  # Returns @news_urls (the collection that was iterated).
  def get_news_stories
    @news_urls.each do |link|
      story = NewsStory.new(link[:url], link[:short_title])
      story.parse_it
      @news_stories << story
    end
  end

  private

  # True when +link+ parses as a URI that carries no host, i.e. it has to be
  # resolved against URL. Links that are not valid URIs are treated as
  # unusable and skipped rather than aborting the scrape.
  def site_relative?(link)
    URI.parse(link).host.nil?
  rescue URI::InvalidURIError
    false
  end
end
# One CNN article: downloads the story page and extracts its main fields.
class NewsStory
  # cnn_news_url - address of the story page
  # short_title  - link text supplied by the caller (may be nil)
  # title        - text of the first <h1> in the content container, or nil
  # author       - text of ".cnn_stryathrtmp .cnnByline"
  # news_time    - text of ".cnn_stryathrtmp .cnn_strytmstmp"
  # story_line   - text of the first <p> in the container, or nil
  # paragraph    - combined text of "p.cnn_storypgraph2"
  # html_content - markup of every <p> in the container
  # content      - combined text of every <p> in the container
  attr_accessor :cnn_news_url,:title,:author,:news_time,:story_line,:paragraph,:short_title,:html_content,:content

  def initialize(cnn_news_url,short_title=nil)
    @cnn_news_url = cnn_news_url
    @short_title = short_title
  end

  # Downloads @cnn_news_url and fills the story attributes from the
  # "#cnnContentContainer" element of the page.
  #
  # Fields whose element is missing become nil (title, story_line) or take
  # whatever #text / #to_s yields for an empty match (the rest).
  # Returns the value assigned to @content.
  def parse_it
    # URI.open instead of Kernel#open: open-uri no longer patches Kernel#open
    # for URLs on Ruby 3.0+. The block form closes the handle once parsed.
    news_url_doc = URI.open(@cnn_news_url) { |page| Nokogiri::HTML(page) }
    container = news_url_doc.css("#cnnContentContainer")
    paragraphs = container.search("p") # queried once, reused below

    @title = first_text(container.search("h1"))
    @author = container.search(".cnn_stryathrtmp .cnnByline").text
    @news_time = container.search(".cnn_stryathrtmp .cnn_strytmstmp").text
    @story_line = first_text(paragraphs)
    @paragraph = container.search("p.cnn_storypgraph2").text
    @html_content = paragraphs.to_s
    @content = paragraphs.text
  end

  private

  # Text of the first node in +nodes+, or nil when nothing matched.
  def first_text(nodes)
    node = nodes.first
    node && node.text
  end
end
end | |
# doc = Nokogiri::HTML(open(url)) | |
# news_urls = [] | |
# news_lists = doc.css("#cnn_maintt2bul").search('div[data-vr-zone="t3"] ul li') | |
# news_lists.each do |news_list| | |
# news_url = news_list.search("a[@href]")[0].attributes["href"].value.to_s | |
# begin | |
# if(URI.parse(news_url).host == nil) | |
# unless(news_url =~/^\/video/) | |
# news_urls << "http://edition.cnn.com"+news_url | |
# end | |
# end | |
# rescue | |
# end | |
# end | |
# news_urls.each do |news_url| | |
# news_url_doc = Nokogiri::HTML(open(news_url)) | |
# container = news_url_doc.css("#cnnContentContainer") | |
# title = container.search("h1")[0].text rescue nil | |
# author = container.search(".cnn_stryathrtmp .cnnByline").text rescue nil | |
# news_time = container.search(".cnn_stryathrtmp .cnn_strytmstmp").text rescue nil | |
# story_line = container.search("p")[0].text | |
# paragraph = container.search("p.cnn_storypgraph2").text | |
# end | |
# binding.pry | |
# cnn = CnnScrapper::CnnParser.new |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment