Created
January 29, 2009 01:07
-
-
Save ahoward/54299 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env ruby | |
Main { | |
description <<-txt | |
pimp3 is easily the greatest program ever written. if you can't figure | |
out what it does hook up some electrodes to your genitals and crank it | |
up fuckers! | |
txt | |
author 'ara.t.howard @ gmail.com' | |
example <<-txt | |
. get the lastest batch of tunes posted to twitter | |
prompt ~> pimp3 | |
. same, with more fanfare | |
prompt ~> pimp3 --verbose | |
txt | |
option("uri"){ | |
description "the twitter search uri" | |
cast :uri | |
default 'http://search.twitter.com/search' | |
} | |
option("pages"){ | |
description "specify the the number of search pages to scrape default(#{ Default.Pages })" | |
argument_required | |
default Default.Pages | |
cast :integer | |
} | |
option("hugeuri"){ | |
description "the twitter huge-ifying uri" | |
cast :uri | |
default 'http://search.twitter.com/hugeurl' | |
} | |
option("pattern"){ | |
description "specifiy the mp3 pattern" | |
argument_required | |
default %|(http://[^\\s]+[^/\\s]+[.]mp3)| | |
} | |
option("basedir", "b"){ | |
description "specifiy the base download dir - default(#{ Default.Basedir })" | |
argument_required | |
default Default.Basedir | |
} | |
option("destination", "d"){ | |
description "specifiy the absolute download dir - default(#{ File.join Default.Basedir, 'auto-based-on-uri' })" | |
argument_required | |
} | |
option("list"){ | |
description "only list the mp3s that would be scraped" | |
} | |
option("threads"){ | |
description "specify the number of threads to download with in parallel - default(#{ Default.Threads })" | |
argument_required | |
default Default.Threads | |
cast :integer | |
} | |
option("timeout"){ | |
description "specify the network timeout for some operations - default(#{ Default.Timeout })" | |
argument_required | |
default Default.Timeout | |
cast :integer | |
} | |
option("noop", "n"){ | |
description "show the downloads that would be performed" | |
} | |
option("verbose", "v"){ | |
description "turn on verbose logging" | |
} | |
def run
  # Entry point invoked by the Main DSL after option parsing.
  # Copies parsed option values into ivars, builds the list of mp3
  # source URIs via the scrape pipeline, then lists, noop-prints,
  # or mirrors them depending on the flags given.
  logger.level = Logger::DEBUG if param['verbose'].given?
  @uri = param["uri"].value
  # NOTE(review): mixes 'param' and 'params' — confirm both are aliases
  # in the main gem; normalizing to one spelling would be clearer.
  @threads = params['threads'].value
  @pattern = Regexp.new params['pattern'].value, Regexp::IGNORECASE
  @hugeuri = param['hugeuri'].value
  @timeout = param['timeout'].value
  @destination = param['destination'].value
  @pages = param['pages'].value
  @basedir = param['basedir'].value
  # pipeline: search pages -> fetched html -> extracted uris ->
  # shortener-expanded uris -> mp3 uris only
  srcs = mp3_uris(expand_uris(extract_uris(parallel_curl(pages_for(@uri)))))
  # --list: print candidates and stop before any downloading
  (puts srcs; exit) if param["list"].given?
  dsts = destinations_for srcs, param["destination"].value
  spec = srcs.zip dsts
  if param["noop"].given?
    # --noop: show the src -> dst plan without touching the filesystem
    spec.each{|src, dst| puts "#{ src } -> #{ dst }"}
  else
    mirror spec
  end
end
def pages_for uri, &block
  # Build one twitter-search URI per results page, pages 1 through
  # @pages, each with ':q => #mp3' and the page number merged into any
  # query already present on +uri+.
  #
  # FIX: the original seeded page = 1 and then used page += 1 inside the
  # loop, so it requested pages 2..@pages+1 and silently skipped page 1.
  (1..@pages).map do |page|
    # re-parse per iteration so each page gets an independent URI object
    page_uri = URI.parse uri.to_s
    page_uri.query = query_for(page_uri.query, :q => '#mp3', :page => page)
    page_uri
  end
end
# process-wide fetch cache: uri string -> stripped body (or Empty on failure)
Curl = {}

def curl uri
  # Fetch +uri+ via open-uri with a @timeout-second cap, memoizing the
  # stripped body in Curl. HTTP errors, timeouts, and ENOENT are logged
  # at debug level and cached as the frozen Empty string so the same
  # failing uri is not retried.
  uri = uri.to_s
  Timeout.timeout(@timeout) do
    debug{ "curl(#{ uri })" }
    Curl[uri] ||= open(uri){|f| f.read.strip}
  end
rescue OpenURI::HTTPError, Timeout::Error, Errno::ENOENT => e
  debug{ "#{ e.message } (#{ e.class })" }
  Curl[uri] ||= Empty
end
# URIs matching any of these patterns are never expanded or downloaded:
# w3c boilerplate links, twitter's own links, twitter's asset bucket,
# and image files.
#
# FIX: the original used Regexp.union('png', 'gif', 'tiff', 'jpg'), which
# matched those letters ANYWHERE in a uri (e.g. "http://pngquant.org/a.mp3"
# was blacklisted). Now only a real image file extension matches.
Blacklist =
  %r|^http://www.w3.org|,
  %r|^http://[^.]*\.?twitter\.com|,
  %r|^http://s3.amazonaws.com/twitter_production|,
  %r{\.(?:png|gif|tiff|jpg)(?:[?#/]|\z)}i

def Blacklist.ed? uri
  # true when any blacklist pattern matches the uri's string form
  any?{|re| re.match(uri.to_s)}
end
def extract_uris *strings
  # Pull every http/ftp uri out of the given strings (joined into one
  # blob) and return them as a cleaned, blacklist-filtered, sorted list.
  # Each surviving uri is logged.
  log '*** EXTRACTING URIS ***'
  string = list_for(strings).join("\n")
  protocols = %w[ http ftp ]
  results = uri_list_for(protocols.map{|protocol| URI.extract(string, protocol) })
ensure
  # FIX: results is nil when an exception fires before assignment; the
  # old unguarded results.each raised NoMethodError here and masked the
  # original exception.
  (results || []).each{|uri| log " - #{ uri }"}
end
def expand_uris *uris
  # Resolve each (possibly shortened) uri through the twitter hugeurl
  # service, in parallel across @threads workers. Blacklisted uris map to
  # Empty (dropped later); uris hugeurl can't expand pass through as-is.
  #
  # FIX: the original threadify block ignored its per-item argument and
  # re-mapped the ENTIRE uri list inside every worker, doing the work
  # @threads times over and returning nested arrays. Each worker now
  # handles just its own uri.
  log '*** EXPANDING URIS ***'
  results =
    uri_list_for(
      list_for(uris).threadify(@threads) do |uri|
        uri = uri.to_s
        unless Blacklist.ed?(uri)
          uri = URI.parse uri
          hugeuri = URI.parse @hugeuri.to_s
          hugeuri.query = query_for(:url => uri)
          result = curl(hugeuri)
          # empty body means hugeurl had nothing better: keep the original
          result = uri.to_s if result.empty?
          result
        else
          debug{ "blacklisted : #{ uri }" }
          Empty
        end
      end
    )
ensure
  # guard: results is nil when an exception fires before assignment
  (results || []).each{|uri| log " - #{ uri }"}
end
def query_for *args
  # Merge any mix of query strings and hashes into a single url-encoded
  # query string. Later args win on key collisions; multi-valued keys
  # (from CGI.parse) produce one k=v pair per value.
  merged = {}
  list_for(args).each do |arg|
    pairs = Hash === arg ? arg : CGI.parse(arg.to_s)
    pairs.each{|key, value| merged[key.to_s] = value}
  end
  encoded =
    merged.map do |key, values|
      list_for(values).map do |value|
        "#{ key }=#{ CGI.escape(value.to_s) }"
      end
    end
  list_for(encoded).join('&')
end
def parallel_curl *uris
  # Fetch every uri concurrently (@threads workers via threadify) and
  # return the non-blank bodies. Failed fetches become nil and are
  # dropped by the outer list_for.
  list_for(
    list_for(uris).threadify(@threads) do |uri|
      begin
        curl(uri.to_s)
      rescue OpenURI::HTTPError
        # curl already caches/logs http failures; just skip this uri
        nil
      rescue Object => e
        # NOTE(review): rescue Object catches every exception class,
        # including non-StandardError ones — presumably deliberate
        # best-effort inside a worker thread; confirm.
        warn "#{ e.message } (#{ e.class })"
        nil
      end
    end
  )
end
def mp3_uris *uris
  # Keep only mp3 links: a uri matching @pattern passes through directly;
  # anything else is fetched and scanned for embedded mp3 links. Work is
  # spread over @threads workers and the surviving uris are logged.
  log "*** FILTERING MP3 LINKS ***"
  results =
    uri_list_for(
      list_for(uris).threadify(@threads) do |uri|
        uri.to_s =~ @pattern ? uri : curl(uri).scan(@pattern)
      end
    )
ensure
  # FIX: guard against results being nil when an exception fires before
  # assignment (same defect as extract_uris/expand_uris) — the unguarded
  # .each masked the original exception with a NoMethodError.
  (results || []).each{|uri| log " - #{ uri }"}
end
def mirror spec
  # Download each [src, dst] pair in parallel (@threads workers),
  # skipping files whose local mtime is already >= the remote
  # Last-Modified, and stamping downloaded files with that time so the
  # check works on the next run. Per-item failures are logged, never fatal.
  log "*** MIRRORING MP3 LINKS ***"
  spec.threadify(@threads) do |src, dst|
    begin
      FileUtils.mkdir_p(File.dirname(dst))
      # missing local file -> epoch, so anything remote counts as newer
      mtime = File.stat(dst).mtime rescue Time.at(0)
      last_modified = last_modified_for(src)
      unless last_modified > mtime
        log " - #{ src } == #{ dst }"
        # FIX: was 'break', which aborts the whole worker block (and in a
        # threadify worker can raise LocalJumpError), killing the mirror
        # of every remaining entry. 'next' skips just this up-to-date file.
        next
      end
      # FIX: the original did 'data = fd.read and fd.close' inside open's
      # block form, closing the handle twice; block-form open closes it.
      data = open(src){|fd| fd.read}
      open(dst, "wb"){|fd| fd.write data}
      File.utime last_modified, last_modified, dst
      log " - #{ src } -> #{ dst }"
    rescue Object => e
      log " - #{ src } ! #{ e.message } (#{ e.class })"
    end
  end
end
def last_modified_for uri
  # HEAD the uri and return its Last-Modified header as a Time.
  # Falls back to Time.now when the header is absent or anything at all
  # goes wrong (DNS, refused connection, bad date) — callers treat "now"
  # as "remote is newer, re-download".
  now = Time.now
  uri = URI.parse uri.to_s
  host = uri.host
  port = uri.port
  path = uri.path
  Net::HTTP.start(host, port) {|http|
    response = http.head(path)
    value = response['last-modified']
    value ? Time.httpdate(value) : now
  }
rescue Object
  # deliberate catch-all: any failure means "treat as new"
  now
end
def destinations_for srcs, destination = nil
  # Map each source uri to an absolute local download path. With an
  # explicit destination dir, files land flat in it; otherwise the path
  # mirrors the uri's host and path under @basedir. All path components
  # are run through clean() to make safe filenames.
  srcs.map do |src|
    path =
      if destination
        File.join destination, clean(File.basename(src))
      else
        uri = URI.parse src.to_s
        segments = uri.path.split("/").map{|segment| clean segment}
        leaf = clean segments.pop
        [ @basedir, uri.host, segments, leaf ].flatten.compact.join(File::SEPARATOR)
      end
    File.expand_path path
  end
end
def clean basename
  # Turn a uri path component into a safe filename: url-decode it, then
  # replace every character outside [0-9a-zA-Z_@)(~.-] with '_' and
  # collapse runs of underscores down to one.
  decoded = CGI.unescape(basename.to_s)
  decoded.gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_').squeeze('_')
end
def list_for *args
  # Flatten arbitrarily nested args into one list, dropping nils and any
  # element whose string form is blank.
  args.flatten.compact.reject{|arg| arg.to_s.strip.empty?}
end
def uri_list_for *args
  # Normalize a nested collection of uri-ish values into a flat, unique,
  # sorted list with blanks and blacklisted entries removed.
  args = args.flatten.compact.select{|arg| not arg.to_s.strip.empty?}
  args.uniq!
  # FIX: was uri.empty? — URI objects don't respond to #empty?, so any
  # URI element raised NoMethodError; compare the string form instead.
  args.delete_if{|uri| uri.to_s.empty? or Blacklist.ed?(uri)}
  # NOTE(review): sort assumes all elements are mutually comparable
  # (all strings or all URIs) — confirm callers never mix them.
  args.sort
end
def uri_for arg
  # Coerce arg into a URI: pass URI instances straight through, parse
  # everything else from its string form.
  return arg if URI === arg
  URI.parse arg.to_s
end
def log *messages
  # Write each non-blank message to the Main logger, exactly one
  # newline-terminated line per message.
  list_for(messages).each{|message| logger << message.to_s.chomp; logger << "\n"}
end
} | |
# Executed before the Main {} block above is evaluated: loads all
# dependencies, configures IO/signals, and computes the Default settings
# that the option declarations refer to.
BEGIN {
  require "yaml"
  require "timeout"
  require "uri"
  require "open-uri"
  require "fileutils"
  require "cgi"
  require 'net/http'
  require 'time'
  require 'ostruct'
  begin
    require "rubygems"
  rescue LoadError
    42 # rubygems is optional here; a LoadError is deliberately ignored
  end
  begin
    require "main"
  rescue LoadError
    STDERR.puts "gem install main"
    exit 1
  end
  begin
    require "threadify"
  rescue LoadError
    STDERR.puts "gem install threadify"
    exit 1
  end
  # unbuffered output so progress shows up immediately
  STDERR.sync = STDOUT.sync = true
  # hard-exit on ctrl-c (exit! skips at_exit handlers)
  trap("INT"){ exit! }
  # program name; doubles as the default download subdirectory name
  This = File.basename(__FILE__)
  # Singleton method on the program-name string: best-effort home-dir
  # lookup covering unix ($HOME) and windows (%USERPROFILE%,
  # %HOMEDRIVE%+%HOMEPATH%), with expand_path("~") as a last resort.
  def This.home
    home =
      catch :home do
        ["HOME", "USERPROFILE"].each do |key|
          throw(:home, ENV[key]) if ENV[key]
        end
        if ENV["HOMEDRIVE"] and ENV["HOMEPATH"]
          throw(:home, "#{ ENV['HOMEDRIVE'] }:#{ ENV['HOMEPATH'] }")
        end
        File.expand_path("~") rescue(File::ALT_SEPARATOR ? "C:/" : "/")
      end
    File.expand_path home
  end
  # Defaults referenced by the option declarations in the Main block.
  Default = OpenStruct.new
  Default.Home = This.home
  Default.Basedir = File.join(Default.Home, "mp3", This)
  Default.Threads = 8
  Default.Timeout = 42
  Default.Pages = 4
  # canonical frozen empty string, used as a "failed fetch" sentinel
  Empty = String.new.freeze
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment