Created
June 14, 2010 02:45
-
-
Save mrbrutti/437222 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env ruby | |
| require 'net/http' | |
| require 'cgi' | |
# Harvests e-mail addresses from GitHub profile pages that are discovered
# through a Google search for "site:github.com intitle:Profile".
#
# Typical use:
#   git = Github.new(200)   # consider at most 200 search hits
#   git.search
#   git.emails_list         # unique addresses collected so far
#   git.profiles            # [[login, profile_url, [emails]], ...]
#
# NOTE(review): the search goes to www.google.com over plain HTTP on port 80
# and parses the result markup with regular expressions; confirm that the
# endpoint and markup are still what this code expects.
class Github
  GOOGLE = "www.google.com"
  GITHUB = "www.github.com"
  PORT = 80
  QUERY = "/cse?q=site:github.com++intitle:Profile&hl=en&cof=&num=100&filter=0&safe=off&start="

  # Results requested per search page; must agree with "num=100" in QUERY.
  PAGE_SIZE = 100

  # "Results <b>1</b> - <b>8</b> of <b>8</b> from <b>www.google.com</b>"
  # Captures the total (digits and thousands separators only).
  HITS_PATTERN = %r{</b> of [\w\s]*<b>([\d,]+)</b> from }

  # Highlighting tags Google inserts into result links.
  HIGHLIGHT_TAGS = %r{</?(?:em|b)>}

  # Quoted link to a GitHub page; captures the first path segment (the login).
  PROFILE_LINK = %r{"https?://(?:www\.)?github\.com/([\w-]+)}

  # JavaScript-obfuscated fragment whose payload is URI-encoded.
  ENCODED_BLOCK = /eval\(decodeURIComponent\('.*'\)\)/

  # Phone-number-like digit runs that get glued onto scraped addresses.
  PHONE_PATTERN = /[+0-9]{0,3}[0-9()]{3,5}[-]{0,1}[0-9]{3,4}[-]{0,1}[0-9]{3,5}/

  # Obfuscation separators that are normalised back to "@" and ".".
  AT_PATTERN = /\s(?:at|@)\s|_at_/i
  DOT_PATTERN = /\sdot\s/i

  # The e-mail pattern is assembled from plain strings on purpose: inside a
  # regexp literal the sequence #$& is interpolation of the last-match global,
  # not the three literal characters, which silently changes the pattern.
  ATOM = '[a-z0-9!#$&\'*+=?^_`{|}~-]+'
  LABEL = '[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'
  TLD = '[a-z](?:[a-z-]*[a-z])?'
  LOCAL = "#{ATOM}(?:\\.#{ATOM})*"
  DOMAIN = "(?:#{LABEL}\\.)+#{TLD}"
  SPELLED_LOCAL = "#{ATOM}(?:\\sdot\\s#{ATOM})*"
  SPELLED_DOMAIN = "(?:#{LABEL}\\sdot\\s)+#{TLD}"

  # Alternatives, in order: user_at_host, "user at host", user@host,
  # "user @ host", and the fully spelled "user dot name at host dot tld".
  EMAIL_PATTERN = Regexp.new(
    [
      "#{LOCAL}_at_#{DOMAIN}",
      "#{LOCAL}\\sat\\s#{DOMAIN}",
      "#{LOCAL}@#{DOMAIN}",
      "#{LOCAL}\\s@\\s#{DOMAIN}",
      "#{SPELLED_LOCAL}\\sat\\s#{SPELLED_DOMAIN}"
    ].join('|'),
    Regexp::IGNORECASE
  )

  attr_accessor :emails_list, :profiles

  # hits - maximum number of search hits to process (Integer or numeric
  #        String). -1 or nil means "whatever total the search reports".
  def initialize(hits = -1)
    @profiles = []
    @emails_list = []
    @start = 0
    # A missing command-line argument arrives as nil; nil.to_i is 0, which
    # would disable crawling altogether, so nil is treated like the default.
    @totalhits = hits.nil? ? -1 : hits.to_i
  end

  # Walks the search result pages, PAGE_SIZE hits at a time, crawling the
  # profiles linked from each page until @totalhits has been covered.
  # Pauses two seconds between pages. Stops early when a page request fails.
  def search
    loop do
      more_pages = false
      get(GOOGLE, PORT, QUERY + @start.to_s) do |res|
        parse(res.body)
        @start += PAGE_SIZE
        crawl(res.body.gsub(HIGHLIGHT_TAGS, "")) unless @totalhits == 0
        more_pages = @totalhits > @start
      end
      break unless more_pages

      sleep(2)
    end
  end

  private

  # Issues GET +query+ against host +url+ on +port+ and calls +block+ with the
  # response when it is a success or a redirection. Any failure is reported on
  # stdout instead of being raised.
  def get(url, port, query, &block)
    response = Net::HTTP.new(url, port).start do |http|
      http.request(Net::HTTP::Get.new(query))
    end
    case response
    when Net::HTTPSuccess, Net::HTTPRedirection
      block.call(response)
    else
      response.error!
    end
  rescue Net::HTTPExceptions
    # Raised by error! for non-success statuses.
    puts "Error: Something went wrong with the HTTP request"
  rescue StandardError => e
    puts "Error: Something went wrong :( #{e.message}"
  end

  # Fetches +uri_str+, following at most +limit+ redirects, and calls +block+
  # with the body of the final successful response.
  #
  # Returns the block's value, or nil when the page could not be processed.
  # Failures are best-effort: they are reported on stderr and the profile is
  # skipped rather than aborting the whole crawl.
  def get_profile(uri_str, limit = 10, &block)
    raise ArgumentError, 'HTTP redirect too deep' if limit <= 0

    response = Net::HTTP.get_response(URI.parse(uri_str))
    case response
    when Net::HTTPSuccess
      block.call(response.body)
    when Net::HTTPRedirection
      # Location may be relative; resolve it against the current URL.
      target = URI.join(uri_str, response['location']).to_s
      get_profile(target, limit - 1, &block)
    else
      response.error!
    end
  rescue StandardError => e
    warn "Skipping #{uri_str}: #{e.message}"
    nil
  end

  # Reads the reported total from a result page and updates @totalhits:
  # 0 when no total is found, the reported total when no limit was requested
  # (-1), otherwise the smaller of the requested limit and the reported total.
  def parse(html)
    reported = html[HITS_PATTERN, 1]
    if reported.nil?
      @totalhits = 0
      return
    end

    real_hits = reported.delete(",").to_i
    @totalhits = real_hits if @totalhits == -1 || @totalhits > real_hits
  end

  # Fetches every distinct GitHub login linked from +text+ and records
  # [login, profile_url, emails] in @profiles for each page that loads.
  def crawl(text)
    text.scan(PROFILE_LINK).flatten.uniq.each do |login|
      profile_url = "http://#{GITHUB}/#{login}"
      get_profile(profile_url) do |body|
        @profiles << [login, profile_url, emails(body)]
      end
    end
  end

  # Extracts the e-mail addresses found in +text+ (plain or lightly
  # obfuscated), prints each one, merges them into @emails_list and returns
  # the unique, normalised addresses for this text. +text+ is not modified.
  def emails(text)
    # Some emails are encoded. decode and replace.
    decoded = text.gsub(ENCODED_BLOCK) { |encoded| CGI.unescape(encoded) }
    # uniq (not uniq!) because uniq! returns nil when nothing was removed.
    found = fix(decoded.scan(EMAIL_PATTERN)).uniq
    found.each { |email| puts email }
    @emails_list.concat(found).uniq!
    found
  end

  # Returns a new list in which every address has its " at " / " @ " / "_at_"
  # and " dot " obfuscation undone and phone-number-like digit runs removed.
  def fix(list)
    list.map do |raw|
      raw.gsub(AT_PATTERN, "@").gsub(DOT_PATTERN, ".").gsub(PHONE_PATTERN, "")
    end
  end
end
# Command-line driver.
#   ARGV[0] - passed to Github.new as the limit on search hits
#   ARGV[1] - optional path; when given, per-profile results are written
#             there as comma-separated lines
git = Github.new(ARGV[0])
puts "---------------------------------------------"
puts " Searching #{ARGV[0]} "
puts "---------------------------------------------"
git.search
puts "---------------------------------------------"
puts " Final Results "
puts "---------------------------------------------"
git.emails_list.each { |email| puts email }
if ARGV[1]
  puts "---------------------------------------------"
  puts " Saving Files into #{ARGV[1]} "
  puts "---------------------------------------------"
  # Block form closes the file even if writing a row raises.
  File.open(ARGV[1], "w") do |out|
    out << "PROFILE, URL, EMAILS\n"
    git.profiles.each do |login, url, emails|
      # join, not map + to_s: Array#to_s is inspect on Ruby 1.9+, which would
      # write brackets and quotes into the e-mail column.
      out << "#{login.upcase},#{url},#{emails.uniq.join('|')}\n"
    end
  end
end
puts "---------------------------------------------"
puts " HAPPY HACKING !!! :) "
puts "---------------------------------------------"
Author
It should work with a few changes. I will make some changes to the Google parsing and the GitHub parser, and it should work in no time.
Great! I was going crazy trying to understand why it's not working right now :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Is this still working?