-
-
Save michaeldv/193122 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby
Main {
# Defaults shared by the option declarations below and by the methods of this
# program: the user's home directory, the download root beneath it, and the
# default number of parallel downloads.
Home = File.expand_path(ENV["HOME"] || ENV["USERPROFILE"] || "~")
Basedir = File.join(Home, "mp3")
Threads = 8

# NOTE(review): description / usage / example / argument / option (and the
# argument_required / default / cast calls inside them) are presumably the
# `main` gem's DSL - confirm exact semantics against that gem's documentation.
description <<-txt
  mp3scrape will scour any url for its mp3 content - the script mirrors,
  never downloading the same file twice. it does not, however, crawl a
  website for links, it simply scrapes all the songs from a single page.
txt

usage['INSTALL'] = 'sudo gem install mp3scrape'
usage['URI'] = 'http://codeforpeople.com'

example <<-txt
  1) get a bunch of xmas tunes
    mp3scrape http://fuelfriends.blogspot.com/2007/12/christmas-mixery.html
  2) get a bunch of tunes
    mp3scrape http://troubledsoulsunite.blogspot.com/
txt

# The page to scrape.
argument("uri"){
  description "the uri to scrape"
  cast :uri
}

# Source text of the regular expression used to find mp3 links; `run` wraps it
# in %r/.../ and uses the single capture group as the link. Matches quoted
# http or https urls ending in a literal ".mp3".
option("pattern", "p"){
  description "specify the mp3 pattern"
  argument_required
  default %|['"](https?://[^\\s]+[^/\\s]+\\.mp3)["']|
}

# Root directory for downloads when no explicit destination is given.
option("basedir", "b"){
  description "specify the base download dir - default(#{ Basedir })"
  argument_required
  default Basedir
}

# Single directory that receives every download, bypassing the per-host
# layout under the base dir.
option("destination", "d"){
  description "specify the absolute download dir - default(#{ File.join Basedir, 'auto-based-on-uri' })"
  argument_required
}

# Print the discovered links and exit without downloading.
option("list"){
  description "only list the mp3s that would be scraped"
}

# Number of parallel download workers handed to `threadify` in `mirror`.
option("threads", "t"){
  description "specify the number of threads to download with in parallel - default(#{ Threads })"
  argument_required
  default Threads
  cast :integer
}

# Print "source -> destination" pairs instead of downloading.
option("noop", "n"){
  description "show the downloads that would be performed"
}
# Entry point: fetch the page named by the "uri" argument, extract every mp3
# link matching the "pattern" option, then either list the links (--list),
# print the planned downloads (--noop) or mirror them to disk.
def run
  uri = param["uri"].value
  pattern = %r/#{ param["pattern"].value }/

  # URI.open (from open-uri) is used instead of Kernel#open, which no longer
  # opens urls on Ruby 3.0+. The block form closes the handle after reading.
  page = URI.open(uri.to_s) { |io| io.read }

  # uniq: a link that appears several times on the page must be fetched once,
  # otherwise parallel workers would write the same destination file.
  srcs = page.scan(pattern).flatten.compact.uniq

  if param["list"].given?
    puts srcs
    exit
  end

  dsts = destinations_for srcs, param["destination"].value
  spec = srcs.zip dsts

  if param["noop"].given?
    spec.each { |src, dst| puts "#{ src } -> #{ dst }" }
  else
    mirror spec
  end
end
# Download every [src, dst] pair in +spec+, in parallel.
#
# spec - Array of [source url, destination path] pairs.
#
# A file is only written when the remote Last-Modified time (or the current
# time, when the server reports none) is newer than the local file's mtime;
# the local mtime is then set to that remote time. Failures of a single
# download are reported on STDERR and do not stop the others.
def mirror(spec)
  spec.threadify(params["threads"].value) do |src, dst|
    begin
      FileUtils.mkdir_p(File.dirname(dst))

      # A missing destination counts as infinitely old.
      mtime = File.exist?(dst) ? File.mtime(dst) : Time.at(0)

      # URI.open (from open-uri) instead of Kernel#open, which no longer
      # opens urls on Ruby 3.0+.
      URI.open(src) do |remote|
        last_modified = remote.last_modified || Time.now

        if last_modified > mtime
          data = remote.read
          File.open(dst, "wb") { |local| local.write data }
          File.utime last_modified, last_modified, dst
          # print with an explicit newline: one write per status line, so
          # lines from parallel workers are not glued together.
          print "#{ src } -> #{ dst }\n"
        else
          print "#{ src } == #{ dst }\n"
        end
      end
    rescue StandardError => e
      # StandardError rather than Object, so exit requests and signals are
      # not swallowed by the best-effort error report.
      STDERR.puts "#{ e.message } (#{ e.class })"
    end
  end
end
# Map each source url to the absolute local path it should be saved under.
#
# srcs        - Array of source urls (strings).
# destination - optional directory; when given, every file is placed directly
#               in it under its cleaned basename.
#
# Without +destination+ the path is <basedir>/<host>/<cleaned url path>, where
# basedir is the value of the "basedir" option.
#
# Returns an Array of expanded paths, parallel to +srcs+.
def destinations_for(srcs, destination = nil)
  # Honour --basedir; its declared default is Basedir.
  basedir = params["basedir"].value

  srcs.map do |src|
    target =
      if destination
        File.join destination, clean(File.basename(src))
      else
        uri = URI.parse src.to_s
        segments = uri.path.to_s.split("/").map { |segment| clean segment }
        # Drop empty, "." and ".." segments: the urls come from a scraped
        # page and must not be able to climb out of the base directory.
        segments = segments.reject { |segment| segment.empty? || segment == "." || segment == ".." }
        File.join(*[ basedir, uri.host, *segments ].compact)
      end

    File.expand_path target
  end
end
# Turn a url path component into a safe file name.
#
# basename - anything responding to to_s (nil becomes "").
#
# The value is url-unescaped, every character outside 0-9 a-z A-Z _ @ ) ( ~ . -
# is replaced by "_", and runs of "_" are collapsed to one.
#
# Returns the cleaned String.
def clean(basename)
  CGI.unescape(basename.to_s)
     .scrub("_") # escapes such as %E9 decode to invalid UTF-8, which would make gsub raise
     .gsub(%r/[^0-9a-zA-Z_@)(~.-]/, "_")
     .squeeze("_")
end
}
# Runs before the rest of the file: loads dependencies, makes output
# unbuffered and installs the interrupt handler.
BEGIN {
  # Standard-library dependencies.
  require "yaml"
  require "uri"
  require "open-uri"
  require "fileutils"
  require "cgi"

  # A failure to load rubygems is deliberately ignored.
  begin
    require "rubygems"
  rescue LoadError
    nil
  end

  # Required gems: print an install hint and exit with status 1 when missing.
  %w[ main threadify ].each do |lib|
    begin
      require lib
    rescue LoadError
      STDERR.puts "gem install #{ lib }"
      exit 1
    end
  end

  STDERR.sync = STDOUT.sync = true

  # Exit immediately on interrupt.
  trap("INT") { exit! }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment