Created
August 22, 2014 12:51
-
-
Save lasombra/a489f715985715663595 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Queries the archive.org advanced-search endpoint for the
# "harvardclassicsbound" collection (first page, 60 rows) and downloads each
# returned item into the current directory. For every item the formats are
# tried in order -- EPUB, PDF, TXT, DjVu -- and the first one that downloads
# successfully is kept.

# We require...
require "httpclient"
require "json"

# Startup variables
SEARCH_URL = "http://archive.org/advancedsearch.php"
SEARCH_QUERY = {
  "q" => 'collection:"harvardclassicsbound" AND (collection:harvardclassicsbound)',
  "output" => "json",
  "rows" => "60",
  "fl[]" => "identifier",
  "save" => "yes",
  "page" => "1"
}.freeze

# File extension and human-readable label of each format, in order of preference.
FORMATS = [
  %w[epub EPUB],
  %w[pdf PDF],
  %w[txt TXT],
  %w[djvu DjVu]
].freeze

# Streams +url+ into the local file +path+.
#
# The file is opened in block form so the handle is always closed, and the
# body is written chunk by chunk instead of being buffered in memory. On any
# error the partially written file is removed (after the handle is closed).
#
# @param http [HTTPClient] client used for the request
# @param url [String] resource to download
# @param path [String] local destination file
# @return [Boolean] true when the download succeeded, false otherwise
def fetch_to_file(http, url, path)
  File.open(path, "wb") do |file|
    http.get_content(url) { |chunk| file.write(chunk) }
  end
  true
rescue StandardError
  File.delete(path) if File.exist?(path)
  false
end

# Downloads the first available format of the item +doc_id+.
#
# Download URL format: https://archive.org/download/identifier/identifier.ext
#
# @param http [HTTPClient] client used for the requests
# @param doc_id [String] item identifier taken from the search response
# @return [Boolean] true when one of the formats was downloaded
def download_document(http, doc_id)
  # The identifier comes from a remote response; keep only its last path
  # component so the local file always lands in the current directory.
  local_name = File.basename(doc_id)

  FORMATS.each_with_index do |(extension, label), index|
    url = "https://archive.org/download/#{doc_id}/#{doc_id}.#{extension}"
    puts "Downloading #{url}" if index.zero?
    return true if fetch_to_file(http, url, "#{local_name}.#{extension}")

    fallback = FORMATS[index + 1]
    if fallback
      puts "Failed to download #{label}. Trying the #{fallback.last} version..."
    else
      # Previously a failure of the last format aborted the whole run.
      puts "Failed to download #{label}. Skipping #{doc_id}."
    end
  end
  false
end

http = HTTPClient.new
# FIXME: certificate verification is disabled, so the HTTPS downloads below
# are not protected against man-in-the-middle attacks. Kept to preserve the
# script's existing behavior; remove once the CA store in use is known to
# validate archive.org.
http.ssl_config.verify_mode = OpenSSL::SSL::VERIFY_NONE

search_response = http.get(SEARCH_URL, SEARCH_QUERY)
# Fall back to an empty list when the response lacks "response"/"docs".
doc_list = JSON.parse(search_response.content).dig("response", "docs") || []

doc_list.each do |doc|
  doc_id = doc["identifier"].to_s
  next if doc_id.empty?

  download_document(http, doc_id)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment