Skip to content

Instantly share code, notes, and snippets.

@mguterl
Forked from ahoward/gist:54299
Created January 29, 2009 16:01
Show Gist options
  • Save mguterl/54598 to your computer and use it in GitHub Desktop.
Save mguterl/54598 to your computer and use it in GitHub Desktop.
#! /usr/bin/env ruby
Main {
# --- program metadata (main-gem DSL) ---
description <<-txt
pimp3 is easily the greatest program ever written. if you can't figure
out what it does hook up some electrodes to your genitals and crank it
up fuckers!
txt

author 'ara.t.howard @ gmail.com'

example <<-txt
. get the latest batch of tunes posted to twitter
prompt ~> pimp3
. same, with more fanfare
prompt ~> pimp3 --verbose
txt

# --- command line options ---

# twitter search endpoint scraped for '#mp3' tweets
option("uri"){
  description "the twitter search uri"
  cast :uri
  default 'http://search.twitter.com/search'
}

# how many pages of search results to walk
option("pages"){
  description "specify the number of search pages to scrape default(#{ Default.Pages })"
  argument_required
  default Default.Pages
  cast :integer
}

# service used to expand shortened urls back to their real targets
option("hugeuri"){
  description "the twitter huge-ifying uri"
  cast :uri
  default 'http://search.twitter.com/hugeurl'
}

# regexp source (compiled case-insensitively in #run) matching mp3 links
option("pattern"){
  description "specify the mp3 pattern"
  argument_required
  default %|(http://[^\\s]+[^/\\s]+[.]mp3)|
}

option("basedir", "b"){
  description "specify the base download dir - default(#{ Default.Basedir })"
  argument_required
  default Default.Basedir
}

option("destination", "d"){
  description "specify the absolute download dir - default(#{ File.join Default.Basedir, 'auto-based-on-uri' })"
  argument_required
}

option("list"){
  description "only list the mp3s that would be scraped"
}

option("threads"){
  description "specify the number of threads to download with in parallel - default(#{ Default.Threads })"
  argument_required
  default Default.Threads
  cast :integer
}

option("timeout"){
  description "specify the network timeout for some operations - default(#{ Default.Timeout })"
  argument_required
  default Default.Timeout
  cast :integer
}

option("noop", "n"){
  description "show the downloads that would be performed"
}

option("verbose", "v"){
  description "turn on verbose logging"
}
# Entry point, invoked by the main gem after option parsing.
#
# Copies parsed CLI params into ivars, scrapes the twitter search
# pages for mp3 links, then either lists them (--list), prints a
# dry-run plan (--noop), or mirrors the files to disk.
def run
  logger.level = Logger::DEBUG if param['verbose'].given?

  # NOTE(review): this used a mix of `param` and `params`; the main
  # gem aliases them, but we use `param` consistently throughout.
  @uri         = param['uri'].value
  @threads     = param['threads'].value
  @pattern     = Regexp.new param['pattern'].value, Regexp::IGNORECASE
  @hugeuri     = param['hugeuri'].value
  @timeout     = param['timeout'].value
  @destination = param['destination'].value
  @pages       = param['pages'].value
  @basedir     = param['basedir'].value

  # scrape -> extract uris -> expand shortened uris -> keep only mp3s
  srcs = mp3_uris(expand_uris(extract_uris(parallel_curl(pages_for(@uri)))))

  (puts srcs; exit) if param['list'].given?

  dsts = destinations_for srcs, param['destination'].value
  spec = srcs.zip dsts

  if param['noop'].given?
    spec.each{|src, dst| puts "#{ src } -> #{ dst }"}
  else
    mirror spec
  end
end
# Build the list of twitter search result URIs to scrape.
#
# uri - the base search uri (String or URI)
#
# Returns @pages URI objects covering pages 1..@pages of the '#mp3'
# search.
def pages_for uri, &block
  # BUGFIX: the counter previously started at 1 and was pre-incremented
  # inside the loop, so page 1 of the results was never fetched (the
  # requests covered pages 2..@pages+1).
  page = 0
  Array.new(@pages).map do
    page += 1
    paged = URI.parse uri.to_s
    paged.query = query_for(paged.query, :q => '#mp3', :page => page)
    paged
  end
end
# Cache of uri(String) => response body, so repeated fetches are free.
Curl = {}

# Fetch +uri+ with open-uri under @timeout, memoizing the stripped
# body in Curl.  Network/timeout failures are logged at debug level
# and cached as the empty string so they are not retried.
def curl uri
  key = uri.to_s
  Timeout.timeout(@timeout) do
    debug{ "curl(#{ key })" }
    Curl[key] ||= open(key){|io| io.read.strip}
  end
rescue OpenURI::HTTPError, Timeout::Error, Errno::ENOENT => e
  debug{ "#{ e.message } (#{ e.class })" }
  Curl[key] ||= Empty
end
# URIs never followed or downloaded: w3c boilerplate links, twitter's
# own hosts and assets, and anything containing an image-extension
# substring.
Blacklist = [
  %r|^http://www.w3.org|,
  %r|^http://[^.]*\.?twitter\.com|,
  %r|^http://s3.amazonaws.com/twitter_production|,
  Regexp.union('png', 'gif', 'tiff', 'jpg')
]

# True when any blacklist pattern matches the stringified +uri+.
def Blacklist.ed? uri
  string = uri.to_s
  any?{|pattern| pattern.match(string)}
end
# Pull every http/ftp uri out of +strings+.
#
# Returns a sorted, uniq'd, blacklist-filtered list of uri strings,
# logging each one.
def extract_uris *strings
  log '*** EXTRACTING URIS ***'
  string = list_for(strings).join("\n")
  protocols = %w[ http ftp ]
  results = uri_list_for(protocols.map{|protocol| URI::extract(string, protocol) })
ensure
  # BUGFIX: guard against nil — when an exception is in flight,
  # `results` is unset and the old `results.each` raised NoMethodError
  # in the ensure, masking the real error.
  results.each{|uri| log " - #{ uri }"} if results
end
# Resolve each (possibly shortened) uri via the twitter 'hugeurl'
# service.
#
# Blacklisted uris collapse to the empty string (filtered out by
# uri_list_for); uris the service cannot expand fall back to
# themselves.
def expand_uris *uris
  log '*** EXPANDING URIS ***'
  results =
    uri_list_for(
      # BUGFIX: the threadify block previously ignored its argument
      # and re-mapped the *entire* uri list once per element, so every
      # uri was expanded n times over.  Each thread now handles
      # exactly one uri.
      list_for(uris).threadify(@threads) do |uri|
        uri = uri.to_s
        unless Blacklist.ed?(uri)
          uri = URI.parse uri
          hugeuri = URI.parse @hugeuri.to_s
          hugeuri.query = query_for(:url => uri)
          result = curl(hugeuri)
          result = uri.to_s if result.empty?
          result
        else
          debug{ "blacklisted : #{ uri }" }
          Empty
        end
      end
    )
ensure
  # guard: results is nil when an exception is in flight
  results.each{|uri| log " - #{ uri }"} if results
end
# Merge hashes and/or raw query strings in +args+ into one url-encoded
# query string ("k=v&k2=v2").  Later args win per key; multi-valued
# keys produce repeated pairs.
def query_for *args
  merged = {}
  list_for(args).each do |arg|
    arg = CGI.parse(arg.to_s) unless Hash === arg
    arg.each{|key, value| merged.update key.to_s => value}
  end
  pairs = merged.map do |key, values|
    list_for(values).map{|value| "#{ key }=#{ CGI.escape(value.to_s) }"}
  end
  list_for(pairs).join('&')
end
# Fetch every uri in parallel, returning the non-empty response
# bodies.  Individual failures are swallowed (nil, then dropped by
# list_for) so one bad uri cannot sink the whole batch.
def parallel_curl *uris
  bodies =
    list_for(uris).threadify(@threads) do |uri|
      begin
        curl(uri.to_s)
      rescue OpenURI::HTTPError
        nil
      rescue Object => e
        warn "#{ e.message } (#{ e.class })"
        nil
      end
    end
  list_for(bodies)
end
# Filter +uris+ down to mp3 links: a uri that already matches @pattern
# is kept as-is; anything else is fetched and its body scanned for
# embedded mp3 links.
def mp3_uris *uris
  log "*** FILTERING MP3 LINKS ***"
  results =
    uri_list_for(
      list_for(uris).threadify(@threads) do |uri|
        uri.to_s =~ @pattern ? uri : curl(uri).scan(@pattern)
      end
    )
ensure
  # BUGFIX: guard against nil — `results` is unset when an exception
  # is in flight, and the bare `results.each` masked the real error
  # with NoMethodError.
  results.each{|uri| log " - #{ uri }"} if results
end
# Download each [src, dst] pair in +spec+ in parallel, skipping files
# whose remote Last-Modified is not newer than the local copy.  Any
# per-file error is logged and does not abort the run.
def mirror spec
  log "*** MIRRORING MP3 LINKS ***"
  spec.threadify(@threads) do |src, dst|
    begin
      FileUtils.mkdir_p(File.dirname(dst))
      # missing local file => epoch, so anything remote counts as newer
      mtime = File.stat(dst).mtime rescue Time.at(0)
      last_modified = last_modified_for(src)
      unless last_modified > mtime
        log " - #{ src } == #{ dst }"
        # BUGFIX: this was `break`, which aborted the whole mirror run
        # at the first up-to-date file instead of skipping just it.
        next
      end
      open src do |fd|
        data = fd.read and fd.close
        open(dst, "wb"){|fd| fd.write data}
        # stamp the local copy so the freshness check works next run
        File.utime last_modified, last_modified, dst
        log " - #{ src } -> #{ dst }"
      end
    rescue Object => e
      log " - #{ src } ! #{ e.message } (#{ e.class })"
    end
  end
end
# HEAD +uri+ and return its Last-Modified header as a Time.  On any
# failure, or when the header is absent, fall back to the current time
# so the caller treats the file as fresh and re-downloads it.
def last_modified_for uri
  now = Time.now
  uri = URI.parse uri.to_s
  Net::HTTP.start(uri.host, uri.port) do |http|
    header = http.head(uri.path)['last-modified']
    header ? Time.httpdate(header) : now
  end
rescue Object
  now
end
# Map each source uri in +srcs+ to a local download path.
#
# With an explicit +destination+ dir, every file lands directly there;
# otherwise the path is derived from @basedir plus the uri's host and
# its cleaned path components.
def destinations_for srcs, destination = nil
  srcs.map do |src|
    basename = clean(File.basename(src))
    path =
      if destination
        File.join destination, basename
      else
        uri = URI.parse src.to_s
        segments = uri.path.split('/').map{|segment| clean segment}
        leaf = clean segments.pop
        [ @basedir, uri.host, segments, leaf ].flatten.compact.join(File::SEPARATOR)
      end
    File.expand_path path
  end
end
# Sanitize a (possibly url-encoded) basename into a filesystem-safe
# name: unescape, replace disallowed characters with '_', then squeeze
# runs of '_' down to a single one.
def clean basename
  name = CGI.unescape(basename.to_s)
  name = name.gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_')
  name.gsub(%r/_+/, '_')
end
# Flatten +args+ into a single array, dropping nils and anything whose
# string form is blank.
def list_for *args
  args.flatten.compact.reject{|arg| arg.to_s.strip.empty?}
end
# Normalize a list of uri strings: flatten, drop nils/blanks, uniq,
# remove empty and blacklisted entries, and return the rest sorted.
def uri_list_for *args
  uris = args.flatten.compact.reject{|arg| arg.to_s.strip.empty?}
  uris = uris.uniq
  uris = uris.reject{|uri| uri.empty? or Blacklist.ed?(uri)}
  uris.sort
end
# Coerce +arg+ to a URI, passing through anything already parsed.
def uri_for arg
  return arg if URI === arg
  URI.parse(arg.to_s)
end
# Write each non-blank message to the logger, one per line.
def log *messages
  list_for(messages).each do |message|
    logger << message.to_s.chomp
    logger << "\n"
  end
end
}
BEGIN {
# --- stdlib dependencies ---
require "yaml"
require "timeout"
require "uri"
require "open-uri"
require "fileutils"
require "cgi"
require 'net/http'
require 'time'
require 'ostruct'
# rubygems is optional (built into newer rubies) — ignore if missing
begin
require "rubygems"
rescue LoadError
42
end
# the CLI framework — hard requirement, bail with install hint
begin
require "main"
rescue LoadError
STDERR.puts "gem install main"
exit 1
end
# parallel enumeration (Enumerable#threadify) — hard requirement
begin
require "threadify"
rescue LoadError
STDERR.puts "gem install threadify"
exit 1
end
# unbuffered output so parallel logging interleaves promptly
STDERR.sync = STDOUT.sync = true
# die immediately on ctrl-c, skipping at_exit handlers
trap("INT"){ exit! }
# Program name (script basename), doubling as a namespace for the
# home-directory lookup below.
This = File.basename(__FILE__)

# Resolve the user's home directory portably: prefer $HOME, then
# $USERPROFILE, then the windows $HOMEDRIVE:$HOMEPATH pair, and
# finally the platform default ("~", or a root path if even that
# fails).
def This.home
  found =
    catch :home do
      %w[ HOME USERPROFILE ].each do |key|
        throw :home, ENV[key] if ENV[key]
      end
      drive, path = ENV["HOMEDRIVE"], ENV["HOMEPATH"]
      throw :home, "#{ drive }:#{ path }" if drive and path
      File.expand_path("~") rescue(File::ALT_SEPARATOR ? "C:/" : "/")
    end
  File.expand_path found
end
# Runtime defaults read by the option declarations in the Main block.
# Members are capitalized so they read like constants at the call
# sites (Default.Pages, Default.Basedir, ...).
home = This.home
Default = OpenStruct.new(
  "Home"    => home,
  "Basedir" => File.join(home, "mp3", This),
  "Threads" => 8,
  "Timeout" => 42,
  "Pages"   => 4
)
# shared frozen empty string, used as the cached value for failed or
# blacklisted fetches
Empty = String.new.freeze
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment