harishbsrinivas · January 4, 2013 05:17
diff --git a/fetch_news.rb b/fetch_news.rb
 #   Copyright 2013                              Harish B Srinivas
 #
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use this file except in compliance with the License.
 #   You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 #   Unless required by applicable law or agreed to in writing, software
 #   distributed under the License is distributed on an "AS IS" BASIS,
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.

 #   This program fetches the individual pages (20+) of the Indian newspaper
 #   Deccan Herald and saves them to disk as PDF files, which can then be
 #   transferred to a tablet. I use this to read the newspaper on my daily
 #   commute :)
 #   In case of errors, see mechanize.log for information.
 #
 #...With a few modifications this might work for other sites as well.

 #!/usr/bin/env ruby

 require 'date'
 require 'rubygems'
 require 'mechanize'
 require 'logger'

 # Site root: the anonymous-login link is looked up on this page and the PDF
 # URLs are built from it.
 URL_LOGIN = "http://deccanheraldepaper.com"
 # Page whose body is scanned for the list of page names once logged in.
 MAIN_PAGE = "#{URL_LOGIN}/svww_left.php"

 # Browser session used for every request: follows redirects and meta
 # refreshes, presents a desktop Firefox user agent, and logs its traffic to
 # mechanize.log in the current working directory.
 agent = Mechanize.new do |session|
   session.log = Logger.new("mechanize.log")
   session.user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6;en-US;" \
                        " rv:1.9.2) Gecko/20100115 Firefox/3.6"
   session.redirect_ok = true
   session.follow_meta_refresh = true
 end
 # Load the login page, follow the anonymous-login link, then fetch the index
 # page that lists the PDF page names.
 login_page = agent.get(URL_LOGIN)

 main_page = login_page.link_with(:text => "Click here")
 # link_with returns nil when no link matches; stop here with a readable
 # message instead of letting agent.click fail on nil.
 raise "Anonymous login link \"Click here\" not found on #{URL_LOGIN}" if main_page.nil?
 agent.click(main_page)
 index = agent.get(MAIN_PAGE)

 # Collect every single-quoted token that starts with 201; the quotes stay in
 # the matches. [^']* stops at the closing quote, so several tokens on one
 # line are returned separately rather than as one greedy match.
 # NOTE(review): the 201 prefix presumably is the start of a date-stamped name
 # (2010-2019); confirm and widen it if the site now uses later years.
 page_list = index.body.scan(/'201[^']*'/)
 warn "No page names found on #{MAIN_PAGE}" if page_list.empty?

 # Work out the date of the issue to download: the current time shifted to a
 # fixed UTC offset, from which the year, month and day are taken.
 # NOTE(review): the header describes an Indian newspaper and IST is UTC+5:30,
 # yet the offset used here is +6 hours - confirm this is intentional.
 utc_offset = +6
 cur_date = DateTime.now.new_offset(Rational(utc_offset, 24))

 # Month and day as two-character, zero-padded strings ("01".."12", "01".."31").
 mod_month = cur_date.strftime("%m")
 mod_day = cur_date.strftime("%d")


 # Download every entry of page_list into news/deccan/YYYYMMDD/ (relative to
 # the current directory), naming the files 01.pdf, 02.pdf, ... in list order.
 # The news/deccan directory must already exist.
 Dir.chdir("news/deccan") do
   issue_dir = "#{cur_date.year}#{mod_month}#{mod_day}"
   # Dir.mkdir raises Errno::EEXIST when the directory is already present,
   # which made a second run on the same day abort before any download.
   Dir.mkdir(issue_dir, 0777) unless File.directory?(issue_dir)
   Dir.chdir(issue_dir) do
     page_list.each_with_index do |quoted_name, position|
       # Entries still carry the single quotes matched by the scan; delete
       # returns a cleaned copy and leaves page_list itself untouched.
       page_name = quoted_name.delete("'")
       file_url = "#{URL_LOGIN}/pdf/#{cur_date.year}/#{mod_month}/#{mod_day}/#{page_name}.pdf"
       page = agent.get(file_url)

       # Dump the response body to a zero-padded, 1-based file name.
       # NOTE(review): strip removes leading/trailing whitespace from the
       # body before writing; kept as in the original - confirm it is needed.
       File.open(format("%02d.pdf", position + 1), 'w+b') do |file|
         file << page.body.strip
       end
     end
   end
 end
	# Copyright 2013 Harish B Srinivas
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# This program fetches the individual pages (20+) of the Indian newspaper
	# Deccan Herald and saves them to disk as PDF files, which can then be
	# transferred to a tablet. I use this to read the newspaper on my daily
	# commute :)
	# In case of errors, see mechanize.log for information.
	#
	#...With a few modifications this might work for other sites as well.

	#!/usr/bin/env ruby

	require 'date'
	require 'rubygems'
	require 'mechanize'
	require 'logger'

	# Site root: the anonymous-login link is looked up on this page and the PDF
	# URLs are built from it.
	URL_LOGIN = "http://deccanheraldepaper.com"
	# Page whose body is scanned for the list of page names once logged in.
	MAIN_PAGE = "#{URL_LOGIN}/svww_left.php"

	# Browser session used for every request: follows redirects and meta
	# refreshes, presents a desktop Firefox user agent, and logs its traffic to
	# mechanize.log in the current working directory.
	agent = Mechanize.new do |session|
	  session.log = Logger.new("mechanize.log")
	  session.user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6;en-US;" \
	                       " rv:1.9.2) Gecko/20100115 Firefox/3.6"
	  session.redirect_ok = true
	  session.follow_meta_refresh = true
	end
	# Load the login page, follow the anonymous-login link, then fetch the index
	# page that lists the PDF page names.
	login_page = agent.get(URL_LOGIN)

	main_page = login_page.link_with(:text => "Click here")
	# link_with returns nil when no link matches; stop here with a readable
	# message instead of letting agent.click fail on nil.
	raise "Anonymous login link \"Click here\" not found on #{URL_LOGIN}" if main_page.nil?
	agent.click(main_page)
	index = agent.get(MAIN_PAGE)

	# Collect every single-quoted token that starts with 201; the quotes stay in
	# the matches. [^']* stops at the closing quote, so several tokens on one
	# line are returned separately rather than as one greedy match.
	# NOTE(review): the 201 prefix presumably is the start of a date-stamped name
	# (2010-2019); confirm and widen it if the site now uses later years.
	page_list = index.body.scan(/'201[^']*'/)
	warn "No page names found on #{MAIN_PAGE}" if page_list.empty?

	# Work out the date of the issue to download: the current time shifted to a
	# fixed UTC offset, from which the year, month and day are taken.
	# NOTE(review): the header describes an Indian newspaper and IST is UTC+5:30,
	# yet the offset used here is +6 hours - confirm this is intentional.
	utc_offset = +6
	cur_date = DateTime.now.new_offset(Rational(utc_offset, 24))

	# Month and day as two-character, zero-padded strings ("01".."12", "01".."31").
	mod_month = cur_date.strftime("%m")
	mod_day = cur_date.strftime("%d")


	# Download every entry of page_list into news/deccan/YYYYMMDD/ (relative to
	# the current directory), naming the files 01.pdf, 02.pdf, ... in list order.
	# The news/deccan directory must already exist.
	Dir.chdir("news/deccan") do
	  issue_dir = "#{cur_date.year}#{mod_month}#{mod_day}"
	  # Dir.mkdir raises Errno::EEXIST when the directory is already present,
	  # which made a second run on the same day abort before any download.
	  Dir.mkdir(issue_dir, 0777) unless File.directory?(issue_dir)
	  Dir.chdir(issue_dir) do
	    # Block parameters use plain pipes; the backslash-escaped pipes in the
	    # earlier text were a syntax error.
	    page_list.each_with_index do |quoted_name, position|
	      # Entries still carry the single quotes matched by the scan; delete
	      # returns a cleaned copy and leaves page_list itself untouched.
	      page_name = quoted_name.delete("'")
	      file_url = "#{URL_LOGIN}/pdf/#{cur_date.year}/#{mod_month}/#{mod_day}/#{page_name}.pdf"
	      page = agent.get(file_url)

	      # Dump the response body to a zero-padded, 1-based file name.
	      # NOTE(review): strip removes leading/trailing whitespace from the
	      # body before writing; kept as in the original - confirm it is needed.
	      File.open(format("%02d.pdf", position + 1), 'w+b') do |file|
	        file << page.body.strip
	      end
	    end
	  end
	end
No results found