-
-
Save Jeff-Russ/5052a85a4656219d98eef52e121e8e3e to your computer and use it in GitHub Desktop.
Ruby script to download a number of files from individual URLs via HTTP/HTTPS/FTP specified in an external file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# Ruby script to download many numbered files
# from individual URLs via HTTP/HTTPS/FTP,
# generated from a wildcard URL template and a numeric range.
# | |
# Author: Jeff Russ | |
# Original Author: Tobias Preuss | |
# License: Creative Commons Attribution-ShareAlike 3.0 Unported | |
require 'net/http' | |
require 'net/ftp' | |
require 'uri' | |
require 'date' | |
# Script-mode preamble: build the help text and bail out early when the
# mandatory arguments (URL template, first number, last number) are missing.
if $PROGRAM_NAME == __FILE__
  # The help text is kept flush-left so the printed message has no leading
  # indentation; `<<-` only allows the terminator itself to be indented.
  usage = <<-EOU
usage example:
$ ./download_numbered_files.rb http://www.domain.com/file**.txt 1 2
The numbers inserted at the wildcards (*) and are padded
in accordance to however many wildcards are given.
If only one wildcard is given, the numbers are not padded.
the above would create a folder at working directory and download to it:
http://www.domain.com/file01.txt as file01.txt
http://www.domain.com/file02.txt as file02.txt
The download folder is called "Downloads" and if it exists "Downloads-1" it tried.
If that exists "Downloads-2" is tried and so on.
An optional fourth argument flag as seen below will simply download to the current
working directory instead. BEWARE: this will overwrite any files!
$ ./download_numbered_files.rb http://www.domain.com/file**.txt 1 2 -wd
  EOU
  # Print the help text to stderr and exit non-zero unless at least three
  # arguments were supplied.
  abort usage unless ARGV.length >= 3
end
# Downloads a single resource over HTTP or HTTPS and stores it on disk.
#
# uri      - parsed URI (URI::HTTP / URI::HTTPS) of the resource.
# filename - local path the response body is written to.
#
# Progress and failures are reported on stdout. Failures never propagate:
# the download is skipped and nil is returned. Only successful (2xx)
# responses are stored, so error pages are not saved as if they were the
# requested file.
def http_download_uri(uri, filename)
  puts "Starting HTTP download for: " + uri.to_s
  http_object = Net::HTTP.new(uri.host, uri.port)
  http_object.use_ssl = true if uri.scheme == 'https'
  http_object.read_timeout = 500
  stored = false
  writing = false
  begin
    http_object.start do |http|
      request = Net::HTTP::Get.new(uri.request_uri)
      http.request(request) do |response|
        unless response.is_a?(Net::HTTPSuccess)
          puts "=> HTTP #{response.code} #{response.message}. Skipping download."
          next
        end
        writing = true
        # File.open (not Kernel#open) so the name is never treated as a
        # command, and binary mode so payload bytes are written untouched.
        File.open(filename, 'wb') do |io|
          # Stream the body chunk by chunk instead of buffering it in memory.
          response.read_body { |chunk| io.write(chunk) }
        end
        writing = false
        stored = true
      end
    end
  rescue StandardError => e
    # StandardError (not Exception) so Ctrl-C and exit still stop the script.
    # A half-written file is removed; otherwise a later run would see it and
    # skip the download as "already exists".
    File.delete(filename) if writing && File.exist?(filename)
    puts "=> Exception: '#{e}'. Skipping download."
    return
  end
  puts "Stored download as " + filename + "." if stored
end
# Downloads a single resource over FTP (anonymous login) and stores it on disk.
#
# uri      - parsed URI (URI::FTP) of the resource.
# filename - local path the remote file is written to.
#
# Progress and failures are reported on stdout. Failures never propagate:
# the download is skipped and nil is returned.
def ftp_download_uri(uri, filename)
  puts "Starting FTP download for: " + uri.to_s + "."
  dirname = File.dirname(uri.path)
  basename = File.basename(uri.path)
  begin
    ftp = Net::FTP.new
    begin
      # Honour an explicit port in the URI; fall back to the standard one.
      ftp.connect(uri.host, uri.port || Net::FTP::FTP_PORT)
      ftp.login
      ftp.chdir(dirname)
      # Store under the requested local name, so the file on disk matches
      # the name reported in the success message below.
      ftp.getbinaryfile(basename, filename)
    ensure
      ftp.close
    end
  rescue StandardError => e
    # StandardError (not Exception) so Ctrl-C and exit still stop the script.
    puts "=> Exception: '#{e}'. Skipping download."
    return
  end
  puts "Stored download as " + filename + "."
end
# Dispatches one download to the handler matching the resource's URI scheme.
#
# resource - URL string of the file to fetch.
# filename - local path the file is stored under.
#
# http/https go to http_download_uri and ftp goes to ftp_download_uri.
# Anything else (another scheme, no scheme at all, or a string that is not a
# valid URI) is reported on stdout and skipped, so one bad entry does not
# stop the remaining downloads.
def download_resource(resource, filename)
  begin
    uri = URI.parse(resource)
  rescue URI::InvalidURIError => e
    puts "Invalid URI for resource " + resource + " (" + e.message + ")."
    return
  end
  # Exact string comparison: a pattern such as /ftp/ would also accept
  # "sftp". to_s covers resources without any scheme (scheme is nil).
  case uri.scheme.to_s.downcase
  when 'http', 'https'
    http_download_uri(uri, filename)
  when 'ftp'
    ftp_download_uri(uri, filename)
  else
    puts "Unsupported URI scheme for resource " + resource + "."
  end
end
# Expands the wildcard URL template given on the command line into one URL
# per number in the requested range and downloads each of them.
#
# usage - help text appended to the message whenever the arguments are
#         rejected.
#
# Reads ARGV directly:
#   ARGV[0] - URL template containing one contiguous run of '*'; the run
#             length is the zero-padded width of the inserted number.
#   ARGV[1] - first number of the range (non-negative integer).
#   ARGV[2] - last number of the range (non-negative integer).
#   ARGV[3] - optional; when any fourth argument is present the files are
#             stored in the current directory instead of a new
#             "Downloads" / "Downloads-N" directory.
#
# Aborts (message on stderr, non-zero exit) on invalid arguments. Files that
# already exist in the target directory are skipped.
def main(usage)
  # determine padding: one digit position per '*' in the template
  uri_template = ARGV[0]
  padding = uri_template.count('*')
  if padding.zero?
    abort "\nThe wildcard '*' was not found in the url argument.\n#{usage}"
  end

  # check for errors in wildcard(s) specified: the span from the first to
  # the last '*' must consist of wildcards only
  wild_first_idx = uri_template.index('*')
  wild_last_idx = uri_template.rindex('*')
  if wild_last_idx - wild_first_idx + 1 != padding
    abort "\nThe wildcard portion must not contain any other characters.\n#{usage}"
  end

  # break provided uri into parts (without wildcard)
  uri_start = uri_template[0...wild_first_idx]
  uri_end = uri_template[(wild_last_idx + 1)..-1]

  # parse range of files specified by 2nd and 3rd args; at least one digit is
  # required, so an empty argument is rejected instead of silently becoming 0
  unless ARGV[1].to_s =~ /\A\d+\z/
    abort "\nYour second argument must be an integer.\n#{usage}"
  end
  unless ARGV[2].to_s =~ /\A\d+\z/
    abort "\nYour third argument must be an integer.\n#{usage}"
  end
  int_start = ARGV[1].to_i
  int_end = ARGV[2].to_i

  # insert numbers, creating array of url, filename pairs
  uris = (int_start..int_end).map do |number|
    full_uri = "#{uri_start}#{number.to_s.rjust(padding, '0')}#{uri_end}"
    { resource: full_uri, filename: full_uri.split('/').last }
  end

  # create/change to target directory: first unused name among
  # "Downloads", "Downloads-1", "Downloads-2", ...
  if ARGV.length == 3
    target_directory = 'Downloads'
    suffix = 0
    # Dir.exist? / File.exist? are used because the `exists?` aliases were
    # removed in Ruby 3.2.
    while Dir.exist?(target_directory)
      suffix += 1
      target_directory = "Downloads-#{suffix}"
    end
    Dir.mkdir(target_directory)
    Dir.chdir(target_directory)
    puts "Changed directory: " + Dir.pwd
  end

  # download_resources
  uris.each do |pair|
    filename = pair[:filename].to_s
    resource = pair[:resource].to_s
    if File.exist?(filename)
      puts "Skipping download for " + filename + ". It already exists."
    else
      download_resource(resource, filename)
    end
  end
end
# Start the downloads only when this file is executed directly; loading it
# from another script just defines the methods without touching ARGV, the
# file system or the network.
main usage if __FILE__ == $0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The original script by @johnjohndoe didn't do what I thought it did, so I adapted it to do what I needed: provide a quick, automated way to download a batch of numbered files from the same web location without having to enter each URL individually. This version takes a URL with a wildcard portion and inserts the numbers of a given range to download them all.
For more details, execute the file from terminal without any arguments.