Created
July 11, 2010 14:05
-
-
Save mrbrutti/471577 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env ruby | |
| require 'net/http' | |
| require 'cgi' | |
# Collects Amazon public-profile links from Google search result pages.
#
# Result pages are requested from GOOGLE one page (PAGE_SIZE hits) at a time
# using QUERY. Every profile link found is fetched to look up the profile's
# location, and the result is appended to #profiles as
# [name, profile_url, location].
class Amazon
  GOOGLE = "www.google.com".freeze
  AMAZON = "www.amazon.com".freeze
  PORT = 80
  QUERY = "/cse?q=site:http://www.amazon.com/gp/pdp/profile/&hl=en&cof=&num=100&filter=0&safe=off&start=".freeze

  # Offset step between result pages; matches the num=100 parameter in QUERY.
  PAGE_SIZE = 100

  # Example of the text this matches:
  #   Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>
  HITS_PATTERN = /<\/b> of [\w\s]*<b>(.*)<\/b> from /

  # Captures the link target and the profile owner's name of one result link.
  # NOTE(review): the bracket expression after "class=l" is a character class,
  # so it accepts any run of the listed characters rather than the literal
  # onmousedown attribute; kept unchanged from the original pattern.
  PROFILE_LINK_PATTERN = /<a href="([0-9A-Za-z:\\\/?&=@+%.;"'()_-]+)" class=l[\sonmousedown="return clk(this.href,'','','res','\d','')"]*>Amazon.com: Profile for ([\w\s]*)/

  # Captures the real target out of a "/url?q=<target>&..." redirect link.
  REDIRECT_TARGET_PATTERN = /\/url\?q=([0-9A-Za-z:\\\/?=@+%.;"'()_-]+)&/

  # Captures the text following "Location:</b>" on a profile page.
  LOCATION_PATTERN = /Location:<\/b>(.*)<\/div>/

  # Collected results: an Array of [name, profile_url, location] triples.
  attr_accessor :profiles

  # @param hits [Integer, #to_i] upper bound for the number of hits to walk
  #   through; -1 means "use the total reported by the result page".
  def initialize(hits = -1)
    @profiles = []
    @start = 0
    @totalhits = hits.to_i
  end

  # Fetches result pages one after another, crawling the profiles on each,
  # until the offset passes the hit total or a request fails. Sleeps two
  # seconds between pages.
  def search
    loop do
      more = false
      get(GOOGLE, PORT, QUERY + @start.to_s) do |res|
        body = res.body.to_s # a response without a body yields nil
        parse(body)
        @start += PAGE_SIZE
        crawl_people(body.gsub(/<em>|<\/em>/, "").gsub(/<b>|<\/b>/, "")) unless @totalhits == 0
        # Assigned last so that an error anywhere above stops the pagination.
        more = @totalhits > @start
      end
      break unless more

      sleep(2)
    end
  end

  private

  # Performs a GET request and passes the response to the block when it is a
  # success or a redirection. Any failure is reported on stdout instead of
  # being raised, including errors raised by the block itself.
  #
  # @param url [String] host name to connect to
  # @param port [Integer] TCP port
  # @param query [String] request path including the query string
  def get(url, port, query, &block)
    Net::HTTP.new(url, port).start do |conn|
      response = conn.request(Net::HTTP::Get.new(query))
      case response
      when Net::HTTPSuccess, Net::HTTPRedirection
        block.call(response)
      else
        response.error!
      end
    end
  rescue Net::HTTPExceptions
    # Mixed into every exception class Net::HTTPResponse#error! can raise.
    puts "Error: Something went wrong with the HTTP request"
  rescue StandardError => e
    puts "Error: Something went wrong :( #{e.message}"
  end

  # Downloads a page, following at most +limit+ redirects.
  #
  # @param uri_str [String] URL of the page
  # @param limit [Integer] number of redirects still allowed
  # @return [String, nil] the page body, or nil on any failure
  def get_profile(uri_str, limit = 10)
    raise ArgumentError, 'HTTP redirect too deep' if limit == 0

    response = Net::HTTP.get_response(URI.parse(uri_str))
    case response
    when Net::HTTPSuccess
      response.body
    when Net::HTTPRedirection
      # URI.join resolves relative Location headers against the current URL.
      get_profile(URI.join(uri_str, response['location']).to_s, limit - 1)
    else
      response.error!
    end
  rescue StandardError
    nil
  end

  # Updates @totalhits from the hit count printed on a result page: 0 when no
  # count is found, the reported count when no limit was requested (-1), and
  # otherwise the smaller of the requested limit and the reported count.
  #
  # @param html [String] raw result page
  def parse(html)
    reported = html.to_s[HITS_PATTERN, 1]
    if reported.nil?
      @totalhits = 0
    else
      realhits = reported.delete(",").to_i
      @totalhits = realhits if @totalhits == -1 || @totalhits > realhits
    end
  end

  # Extracts every profile link from +text+, fetches each profile page to read
  # its location, prints the resulting triple and appends it to @profiles.
  # The location is an empty string when the page cannot be fetched or does
  # not contain one.
  #
  # @param text [String] result page with <b>/<em> tags already removed
  def crawl_people(text)
    text.scan(PROFILE_LINK_PATTERN).each do |href, name|
      # Unwrap "/url?q=<target>&..." redirect links; plain links are used as is.
      profile_url = href[REDIRECT_TARGET_PATTERN, 1] || href
      page = get_profile(profile_url)
      location = page.to_s[LOCATION_PATTERN, 1].to_s.strip
      entity = [name, profile_url, location]
      p entity
      @profiles << entity
    end
  end
end
# Command-line driver.
#
# Usage: <script> [max_hits] [output_file]
#   max_hits    - upper bound on hits to walk through; when omitted, -1 is
#                 passed so the Amazon class uses its "no limit" default
#   output_file - optional path; results are also written there as CSV lines

# Prints +title+ framed by two separator lines.
def banner(title)
  puts "---------------------------------------------"
  puts title
  puts "---------------------------------------------"
end

# Prints one "name,url,location" line per collected profile.
def print_profiles(profiles)
  profiles.each { |row| puts "#{row[0]},#{row[1]},#{row[2]}" }
end

# Writes the collected profiles to +path+, one comma-separated line each,
# preceded by a header line. The block form closes the file even on error.
def save_profiles(profiles, path)
  banner(" Saving Files into #{path} ")
  File.open(path, "w") do |out|
    out << "PROFILE,URL,LOCATION\n"
    profiles.each { |row| out << "#{row[0]},#{row[1]},#{row[2]}\n" }
  end
end

# nil.to_i would be 0, which makes the search crawl nothing; fall back to -1.
amazon = Amazon.new(ARGV[0] ? ARGV[0].to_i : -1)

# On Ctrl-C, dump whatever has been collected so far before exiting.
trap("INT") do
  puts "SEARCH CANCELED ... PRINTING RESULTS ...."
  banner(" Temp Results ")
  print_profiles(amazon.profiles)
  save_profiles(amazon.profiles, ARGV[1]) if ARGV[1]
  exit(1)
end

banner(" Searching #{ARGV[0]} ")
amazon.search

banner(" Final Results ")
print_profiles(amazon.profiles)
save_profiles(amazon.profiles, ARGV[1]) if ARGV[1]

banner(" HAPPY HACKING !!! :) ")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment