Skip to content

Instantly share code, notes, and snippets.

@mrbrutti
Created July 11, 2010 14:05
Show Gist options
  • Select an option

  • Save mrbrutti/471577 to your computer and use it in GitHub Desktop.

Select an option

Save mrbrutti/471577 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
require 'net/http'
require 'cgi'
# Harvests Amazon profile pages through a Google site-restricted search.
#
# Each Google result page is scanned for "Amazon.com: Profile for <name>"
# links; every linked profile page is then downloaded and the text following
# its "Location:" label is extracted. Collected entries are appended to
# #profiles as [name, url, location] arrays.
class Amazon
  GOOGLE = "www.google.com"
  AMAZON = "www.amazon.com"
  PORT = 80
  QUERY = "/cse?q=site:http://www.amazon.com/gp/pdp/profile/&hl=en&cof=&num=100&filter=0&safe=off&start="

  # Number of results requested per page; mirrors num=100 in QUERY.
  PAGE_SIZE = 100

  # Captures [href, profile name] from a Google result anchor.
  PROFILE_LINK = /<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>Amazon.com: Profile for ([\w\s]*)/

  # Captures the real target out of a Google "/url?q=<target>&amp;..." link.
  REDIRECT_TARGET = /\/url\?q=([0-9A-Za-z:\\\/?=@+%.;"'()_-]+)&amp/

  # Captures the text between "Location:</b>" and the closing </div>.
  LOCATION_FIELD = /Location:<\/b>(.*)<\/div>/

  # @param hits [Integer, #to_i] maximum number of search hits to walk;
  #   -1 means "use whatever total Google reports".
  def initialize(hits = -1)
    @profiles = []
    @start = 0
    @totalhits = hits.to_i
  end

  attr_accessor :profiles

  # Walks the result pages PAGE_SIZE hits at a time until @totalhits is
  # covered or a request fails, filling #profiles along the way.
  #
  # Pages are fetched in a loop rather than by recursing from inside the HTTP
  # block, so each connection is closed before the next page is requested.
  def search
    loop do
      completed = false
      get(GOOGLE, PORT, QUERY + @start.to_s) do |res|
        body = res.body.to_s
        parse(body)
        @start += PAGE_SIZE
        unless @totalhits == 0
          crawl_people(body.gsub(/<em>|<\/em>/, "").gsub(/<b>|<\/b>/, ""))
        end
        # Only set once the whole page was processed: if anything above
        # raised, `get` reports it and the crawl stops here.
        completed = true
      end
      break unless completed && @totalhits > @start
      sleep(2) # pause between result pages
    end
  end

  private

  # Issues a GET for +query+ against +url+:+port+ and passes the response to
  # the block when it is a success or a redirection. Any other response is
  # turned into an exception via Net::HTTPResponse#error!. All StandardErrors
  # (including ones raised by the block) are reported on stdout and swallowed.
  def get(url, port, query, &block)
    Net::HTTP.new(url, port).start do |conn|
      response = conn.request(Net::HTTP::Get.new(query))
      case response
      when Net::HTTPSuccess, Net::HTTPRedirection
        block.call(response)
      else
        response.error!
      end
    end
  rescue Net::ProtocolError
    # Common ancestor of the HTTP error classes raised by #error!.
    puts "Error: Something went wrong with the HTTP request"
  rescue StandardError => e
    puts "Error: Something went wrong :( #{e.message}"
  end

  # Downloads +uri_str+, following up to +limit+ redirects.
  #
  # @return [String, nil] the response body, or nil on any failure
  #   (bad URI, network error, non-success status, too many redirects).
  def get_profile(uri_str, limit = 10)
    raise ArgumentError, 'HTTP redirect too deep' if limit == 0

    response = Net::HTTP.get_response(URI.parse(uri_str))
    case response
    when Net::HTTPSuccess
      response.body
    when Net::HTTPRedirection
      get_profile(response['location'], limit - 1)
    else
      response.error!
    end
  rescue StandardError
    # Best effort: a profile that cannot be fetched must not stop the crawl.
    nil
  end

  # Reads the total hit count from a result page, e.g.
  #   Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>
  # and updates @totalhits: 0 when no count is found, the reported total when
  # no limit was requested (-1), otherwise the smaller of limit and total.
  def parse(html)
    hits = html.scan(/<\/b> of [\w\s]*<b>(.*)<\/b> from /)
    if hits.empty?
      @totalhits = 0
      return
    end

    realhits = hits[0][0].gsub(",", "").to_i
    @totalhits = realhits if @totalhits == -1 || @totalhits > realhits
  end

  # Extracts every profile link from +text+ (a result page with <em>/<b>
  # tags already removed), fetches each profile and records
  # [name, url, location]. Location is "" when the profile page could not be
  # fetched or has no "Location:" field.
  def crawl_people(text)
    text.scan(PROFILE_LINK).each do |href, name|
      url = href.to_s
      # Unwrap Google redirect links; keep the raw href if unwrapping fails.
      url = url[REDIRECT_TARGET, 1] || url if url.include?("/url?q=")
      location = get_profile(url).to_s[LOCATION_FIELD, 1].to_s.strip
      entity = [name, url, location]
      p entity # progress output
      @profiles << entity
    end
  end
end
# Command-line driver.
#
#   ARGV[0]  maximum number of hits to crawl (optional; all hits when omitted)
#   ARGV[1]  path of a CSV file to save the results into (optional)
#
# Pressing Ctrl-C prints (and, if requested, saves) whatever has been
# collected so far and exits with status 1.

# Prints every collected profile as a "name,url,location" line on stdout.
def print_profiles(profiles)
  profiles.each { |name, url, location| puts "#{name},#{url},#{location}" }
end

# Writes the collected profiles to +path+ as CSV, announcing it on stdout.
# The block form of File.open closes the file even if a write fails.
def save_profiles(profiles, path)
  puts "---------------------------------------------"
  puts " Saving Files into #{path} "
  puts "---------------------------------------------"
  File.open(path, "w") do |out|
    out << "PROFILE, URL, LOCATION\n"
    profiles.each { |name, url, location| out << "#{name},#{url},#{location}\n" }
  end
end

# nil.to_i is 0, which would cap the crawl at zero hits; fall back to -1
# ("use the total Google reports") when no limit is given.
amazon = Amazon.new(ARGV[0] ? ARGV[0].to_i : -1)

trap("INT") do
  puts "SEARCH CANCELED ... PRINTING RESULTS ...."
  puts "---------------------------------------------"
  puts " Temp Results "
  puts "---------------------------------------------"
  print_profiles(amazon.profiles)
  save_profiles(amazon.profiles, ARGV[1]) if ARGV[1]
  exit(1)
end

puts "---------------------------------------------"
puts " Searching #{ARGV[0]} "
puts "---------------------------------------------"
amazon.search
puts "---------------------------------------------"
puts " Final Results "
puts "---------------------------------------------"
# Results go to stdout here; the output file is only opened in save_profiles.
print_profiles(amazon.profiles)
save_profiles(amazon.profiles, ARGV[1]) if ARGV[1]
puts "---------------------------------------------"
puts " HAPPY HACKING !!! :) "
puts "---------------------------------------------"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment