Created
May 6, 2009 00:16
-
-
Save sstephenson/107296 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A library for downloading issues of Harpers from the archives.
#
# To use, log in to the archives from your web browser. Then set the
# HARPERS_ARCHIVE_COOKIE environment variable to the value of the .harpers.org
# "archive" cookie, which you can find by visiting the following URL while on
# the Harpers website:
#
#   javascript:prompt("HARPERS_ARCHIVE_COOKIE",document.cookie.match(/(?:;|^)\s*archive=(.+?)(?:;|$)/)[1])
#
# Example:
#   % HARPERS_ARCHIVE_COOKIE=6q626...PFHOF irb -r harpers
#   >> issue = Harpers::Issue.new(2001, 1)
#   => #<Harpers::Issue http://harpers.org/archive/2001/01>
#   >> issue.download!
#
# Requires Hpricot (gem install hpricot).
#
require "net/http"
require "hpricot"
require "fileutils"
module Harpers
HARPERS_BASE_URL = "http://harpers.org/"
ARCHIVE_COOKIE = ENV["HARPERS_ARCHIVE_COOKIE"]
SAFARI_USER_AGENT = "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_5; en-us) AppleWebKit/528.1 (KHTML, like Gecko) Version/4.0 Safari/528.1"
# Shared behavior for anything fetched from the Harpers site: builds URLs,
# performs throttled HTTP GETs carrying the session cookie, and caches the
# response body per instance.
class Record
  class << self
    # Sleep for a random number of seconds bounded by the active delay cap,
    # or do nothing when no cap has been set via with_random_delay.
    def perform_random_delay
      return unless @max_delay
      pause = rand(@max_delay)
      puts "pausing for #{pause} seconds"
      sleep pause
    end

    # Run the given block with the request-throttle cap set to +max_delay+
    # seconds, restoring the previous cap afterwards (even if the block
    # raises). Returns the block's value.
    def with_random_delay(max_delay)
      saved = @max_delay
      @max_delay = max_delay
      yield
    ensure
      @max_delay = saved
    end
  end

  # Join +path+ segments onto the site root and return a parsed URI.
  def url(*path)
    URI.parse(File.join(HARPERS_BASE_URL, *path))
  end

  def inspect
    "#<#{self.class.name} #{url}>"
  end

  protected

  # Fetch this record's URL (after an optional throttling pause) and
  # memoize the raw response body.
  def contents
    @contents ||= begin
      Record.perform_random_delay
      body = nil
      Net::HTTP.start(url.host, url.port) do |http|
        http.request_get(url.path, headers) do |response|
          body = response.body
        end
      end
      body
    end
  end

  # Headers sent with every request: a spoofed Safari user agent plus the
  # archive session cookie taken from the environment.
  def headers
    {
      "User-Agent" => SAFARI_USER_AGENT,
      "Cookie" => "archive=#{ARCHIVE_COOKIE}"
    }
  end
end
# A single monthly issue in the Harpers archive, addressed by year and month.
class Issue < Record
  attr_reader :year, :month

  def initialize(year, month)
    @year = year.to_s
    @month = "%02d" % month
  end

  # e.g. http://harpers.org/archive/2001/01
  def url
    super("archive", year, month)
  end

  # Scrape the issue's thumbnail links into Page objects (memoized). The
  # page number is the trailing digit run of each thumbnail's href.
  def pages
    @pages ||= begin
      document = Hpricot(contents)
      document.search("div#thumbnails a").map do |anchor|
        page_number = anchor.attributes["href"][/\/(\d+)$/, 1]
        Page.new(self, page_number, anchor.attributes["title"])
      end
    end
  end

  # Download every page of this issue, pausing a random interval (up to 60
  # seconds) between requests to avoid hammering the server.
  def download!
    Record.with_random_delay(60) do
      total = pages.length
      pages.each_with_index do |page, index|
        page.download!
        puts "Downloaded page #{index + 1} of #{total}"
      end
    end
  end
end
# One page of an issue, downloadable as a PDF from the archive.
class Page < Record
  attr_reader :issue, :number, :title

  def initialize(issue, number, title)
    @issue = issue
    # Zero-pad the page number to four digits to match the archive's file
    # naming. The original "%04s" format space-pads strings in Ruby, which
    # produced broken URLs/paths for page numbers shorter than four digits.
    @number = "%04d" % number.to_i
    @title = title
  end

  # e.g. http://harpers.org/media/pages/2001/01/pdf/0001.pdf
  def url
    super("media", "pages", issue.year, issue.month, "pdf", "#{number}.pdf")
  end

  # Local destination: archives/YYYY/MM/NNNN.pdf next to this file.
  def path
    File.join(File.dirname(__FILE__), "archives", issue.year, issue.month, "#{number}.pdf")
  end

  def downloaded?
    # File.exists? was deprecated and removed in Ruby 3.2; File.exist? is
    # the supported spelling.
    File.exist?(path)
  end

  # Fetch the PDF and write it to +path+, skipping pages already on disk.
  def download!
    return if downloaded?
    FileUtils.mkdir_p(File.dirname(path))
    contents = self.contents
    # Binary mode ("wb") keeps the PDF bytes intact — text mode could
    # corrupt them via newline translation or encoding conversion.
    File.open(path, "wb") { |file| file << contents }
  end
end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment