Created
January 4, 2013 05:17
-
-
Save harishbsrinivas/4450154 to your computer and use it in GitHub Desktop.
A simple Ruby script to scrape PDF files from an Indian newspaper website. www.deccanheraldepaper.com allows downloading individual pages of the newspaper, but there is no way to download all of them at once. The script downloads all the pages to my computer, which can then be transferred to my tablet for reading offline during my commute.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2013 Harish B Srinivas
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This program fetches the individual pages of the Indian newspaper
# Deccan Herald and saves all of the (20+) pages to disk, which can
# then be transferred to a tablet. I use this to read the newspaper on
# my daily commute :)
# In case of errors see mechanize.log for information.
#
# With a few modifications this might work for other sites as well.
#!/usr/bin/env ruby
require 'date'
require 'fileutils'
require 'logger'
require 'rubygems'
require 'mechanize'
# Site entry point; this is the page that carries the anonymous-login link.
URL_LOGIN = "http://deccanheraldepaper.com"
# Page whose body is scanned for the quoted names of the day's PDF pages.
MAIN_PAGE = "http://deccanheraldepaper.com/svww_left.php"

# Browser-like Mechanize agent; all traffic is logged to mechanize.log in the
# directory the script is started from.
agent = Mechanize.new
agent.follow_meta_refresh = true
agent.redirect_ok = true
agent.user_agent = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6;en-US;" \
                   " rv:1.9.2) Gecko/20100115 Firefox/3.6"
agent.log = Logger.new("mechanize.log")

# Load the login page and follow the anonymous-login link. Fail with a clear
# message if the link is missing instead of passing nil to agent.click.
login_page = agent.get(URL_LOGIN)
login_link = login_page.link_with(:text => "Click here")
raise "Login link \"Click here\" not found on #{URL_LOGIN}" unless login_link
agent.click(login_link)

# Collect every single-quoted token that starts with "20<digit>". The match is
# bounded by [^'] so several names on one line are not merged into one, and the
# decade is not hard-coded to 201x.
# NOTE(review): assumes page names never contain a quote character -- confirm
# against the live page source.
index_page = agent.get(MAIN_PAGE)
page_names = index_page.body.scan(/'(20\d[^']*)'/).flatten

# The edition date is taken from the current time at a fixed UTC offset.
# NOTE(review): the offset is kept at +6 hours as in the original script;
# Indian Standard Time is UTC+5:30 -- confirm which one is intended.
edition_time = Time.now.getlocal("+06:00")
year  = edition_time.strftime("%Y")
month = edition_time.strftime("%m")
day   = edition_time.strftime("%d")

# news/deccan/YYYYMMDD; mkdir_p creates missing parents and does not fail when
# the directory already exists (e.g. when the script is re-run the same day).
target_dir = File.join("news", "deccan", "#{year}#{month}#{day}")
FileUtils.mkdir_p(target_dir, :mode => 0777)

Dir.chdir(target_dir) do
  page_names.each_with_index do |name, position|
    file_url = "#{URL_LOGIN}/pdf/#{year}/#{month}/#{day}/#{name}.pdf"
    page = agent.get(file_url)
    # Pages are saved as 01.pdf, 02.pdf, ... in the order they were listed.
    # The body is written byte-for-byte: PDFs are binary, so the payload must
    # not be stripped or otherwise altered.
    File.open(format("%02d.pdf", position + 1), "wb") do |file|
      file << page.body
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment