Skip to content

Instantly share code, notes, and snippets.

@mgalgs
Created June 7, 2012 06:09
Show Gist options
  • Save mgalgs/2886830 to your computer and use it in GitHub Desktop.
Save mgalgs/2886830 to your computer and use it in GitHub Desktop.
require 'xmlsimple'
require 'pp'
require 'net/http'
require 'digest/md5'
# require 'open-uri'
# Base URL of the archive.org download directory; the main script appends each
# file name from the XML config to this string to build the download URL.
baseuri = 'http://www.archive.org/download/MIT_Structure_of_Computer_Programs_1986/'
# Integer percentage of +total+ represented by +done+.
# Returns 0 when +total+ is not positive (e.g. the server sent no
# Content-Length), so progress reporting never divides by zero.
def download_percent(done, total)
  return 0 unless total > 0
  (done * 100) / total
end

# Downloads +uri_str+ to +save_filename+, streaming the response body to disk
# chunk by chunk and following HTTP redirects.
#
# uri_str       - absolute http:// or https:// URL to fetch
# save_filename - path of the file to create/overwrite with the response body
# lvl           - number of redirects followed so far (0 for the first
#                 request); used for the log prefix and the redirect limit
# max_redirects - stop following redirects once +lvl+ reaches this value
#
# Progress is printed to stdout. A response that is neither a success nor a
# redirect is reported with a message and otherwise ignored. Returns nil.
def get_large_file_rec(uri_str, save_filename, lvl, max_redirects: 10)
  prefix = lvl == 0 ? "getting" : ("=" * lvl) + ">"
  printf "%s %s\n", prefix, uri_str
  req_uri = URI(uri_str)
  redirect_target = nil

  # Honour the port and scheme of the URI instead of always talking plain
  # HTTP to port 80.
  Net::HTTP.start(req_uri.host, req_uri.port, use_ssl: req_uri.scheme == 'https') do |http|
    http.request_get(req_uri.request_uri) do |response|
      case response
      when Net::HTTPSuccess
        puts "Got Net::HTTPSuccess. Saving to " + save_filename
        bytes_downloaded = 0
        num_chunks = 0
        # to_i turns a missing Content-Length header (nil) into 0.
        total_bytes = response['content-length'].to_i
        printf "total length: %d MB\n", total_bytes / 1_000_000
        # Binary mode: the payload must be written byte-for-byte.
        File.open(save_filename, 'wb') do |io|
          response.read_body do |chunk|
            io.write chunk
            num_chunks += 1
            bytes_downloaded += chunk.bytesize
            # Only refresh the progress line every 100 chunks.
            if num_chunks % 100 == 0
              printf " :: Downloaded %d/%d (MB) (%d%%) \r",
                     bytes_downloaded / 1_000_000, total_bytes / 1_000_000,
                     download_percent(bytes_downloaded, total_bytes)
            end
          end
          printf "Downloaded %d/%d (MB) (%d%%) \n\n",
                 bytes_downloaded / 1_000_000, total_bytes / 1_000_000,
                 100
        end
      when Net::HTTPRedirection
        location = response['Location']
        if location
          # Location may be relative; resolve it against the current URL.
          redirect_target = URI.join(uri_str, location).to_s
        else
          puts "Got a redirect without a Location header!"
        end
      else
        puts "Got something else!"
      end
    end
  end

  # Follow the redirect only after the connection above has been closed, so
  # a chain of redirects does not keep every intermediate connection open.
  return if redirect_target.nil?
  if lvl >= max_redirects
    puts "Too many redirects (#{lvl}); giving up."
    return
  end
  get_large_file_rec(redirect_target, save_filename, lvl + 1, max_redirects: max_redirects)
  nil
end # def get_large_file_rec
# Public entry point for a download: kicks off the redirect-following fetch of
# +uri_str+ into +save_filename+ with the redirect depth set to zero.
def get_large_file(uri_str, save_filename)
  initial_depth = 0
  get_large_file_rec(uri_str, save_filename, initial_depth)
end
# Reports whether the file at +filename+ exists and its MD5 hex digest equals
# +digest+.
#
# filename - path of the file to check
# digest   - expected MD5 digest as a lowercase hex string
#
# Prints a notice before hashing. Returns true on a match, false when the file
# is missing or the digest differs.
def digest_matches(filename, digest)
  # File.exists? was removed in Ruby 3.2; File.exist? is the supported spelling.
  return false unless File.exist?(filename)
  puts "calculating digest for #{filename}"
  # Digest::MD5.file hashes the file in chunks (binary-safe) instead of
  # loading the whole download into memory first.
  Digest::MD5.file(filename).hexdigest == digest
end
## main
# Reads the XML file list named by the first command-line argument and fetches
# every entry whose source is 'original' into the parent directory, skipping
# entries whose local copy already has the MD5 recorded in the list.
config = XmlSimple.xml_in(ARGV[0])
originals = config['file'].select { |entry| entry['source'] == 'original' }
originals.each do |entry|
  name = entry['name']
  local_path = '../' + name
  expected_md5 = entry['md5'][0]
  unless digest_matches(local_path, expected_md5)
    get_large_file(baseuri + name, local_path)
    next
  end
  puts "digest matches. this file has already been downloaded."
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment