Created
April 20, 2015 14:47
-
-
Save dgutov/addc25762453c5a205ea to your computer and use it in GitHub Desktop.
Tabs or Spaces
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gem dependencies for the tabs-or-spaces script.
# The `source :rubygems` shorthand is deprecated by Bundler and resolves to a
# plain-HTTP endpoint; name the HTTPS source explicitly instead.
source 'https://rubygems.org'

# Debugging console, only needed while developing.
group :development do
  gem 'pry'
end

gem 'yajl-ruby'
gem 'em-synchrony'
gem 'em-http-request'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
GEM
  remote: http://rubygems.org/
  specs:
    addressable (2.3.7)
    coderay (1.1.0)
    cookiejar (0.3.2)
    em-http-request (1.1.2)
      addressable (>= 2.3.4)
      cookiejar
      em-socksify (>= 0.3)
      eventmachine (>= 1.0.3)
      http_parser.rb (>= 0.6.0)
    em-socksify (0.3.0)
      eventmachine (>= 1.0.0.beta.4)
    em-synchrony (1.0.4)
      eventmachine (>= 1.0.0.beta.1)
    eventmachine (1.0.7)
    http_parser.rb (0.6.0)
    method_source (0.8.2)
    pry (0.10.1)
      coderay (~> 1.1.0)
      method_source (~> 0.8.1)
      slop (~> 3.4)
    slop (3.6.0)
    yajl-ruby (1.2.1)

PLATFORMS
  ruby

DEPENDENCIES
  em-http-request
  em-synchrony
  pry
  yajl-ruby
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'ostruct'
require 'zlib'

require 'em-synchrony'
require 'em-synchrony/em-http'
require 'em-synchrony/fiber_iterator'
require 'pry'
require 'yajl'
# Running totals keyed by file extension. Each extension gets its own counter
# hash, so updating one language never touches another.
$results = Hash[%w(c cpp js java el).map { |ext| [ext, { tabs: 0, spaces: 0 }] }]

# Captures the extension of a path that ends in one of the tracked extensions;
# \z anchors at the very end so "foo.c.txt" does not match.
$ext_re = /\.(#{$results.keys.join("|")})\z/
# Counts how many added lines of one file are indented with tabs versus with
# eight leading spaces, prints a per-file summary, and adds the counts to the
# global $results entry for the file's extension.
#
# path      - file path taken from the diff header; nil means no file was seen yet.
# lines     - added lines of that file, without the leading "+" diff marker.
# repo_name - "owner/repo" string, used only to build the printed URL.
# sha       - commit SHA, used only to build the printed URL.
#
# Returns nil without doing anything when path is nil, when there are no
# lines, or when the extension is not one of the keys of $results.
def process_lines(path, lines, repo_name, sha)
  return if path.nil? || lines.empty?

  ext = path[$ext_re, 1]
  return unless ext

  spaces = 0
  tabs = 0
  lines.each do |line|
    # Only the very start of the line is inspected: eight spaces count as
    # space indentation, a leading tab as tab indentation; anything else
    # (shallower indents, unindented lines) is ignored.
    case line
    when /^ {8}/
      spaces += 1
    when /^\t/
      tabs += 1
    end
  end

  puts "https://raw.githubusercontent.com/#{repo_name}/#{sha}/#{path}: spaces #{spaces}, tabs #{tabs}"

  # Totals are updated only after the whole file has been scanned, so an
  # exception raised while matching leaves $results untouched.
  lang_results = $results[ext]
  lang_results[:spaces] += spaces
  lang_results[:tabs] += tabs
end
# Prints a 60-character text progress bar followed by "current/total".
#
# current - number of items finished so far.
# total   - number of items overall.
#
# A total of zero (nothing to process) renders an empty bar instead of
# raising ZeroDivisionError, and the filled portion is clamped to the bar
# width so an out-of-range current cannot produce a negative padding count.
def output_progress(current, total)
  size = 60
  filled = total > 0 ? current * size / total : 0
  filled = [[filled, 0].max, size].min
  print "[#{'=' * filled}#{' ' * (size - filled)}] #{current}/#{total}\n"
end
# Processes one gzipped JSON event dump: collects every commit referenced by
# a PushEvent, downloads each commit's diff from github.com (30 requests in
# flight at a time), and feeds the added lines of every touched file to
# process_lines.
#
# name - path of a local .json.gz file.
#
# Always prints the accumulated $results on the way out, including when an
# exception propagates.
def process_file(name)
  puts "Processing #{name}..."

  # The block form closes the underlying file handle once the data is read;
  # the previous open(name) handle was never closed. Opening through Zlib
  # also avoids Kernel#open's special handling of names starting with "|".
  js = Zlib::GzipReader.open(name) { |gz| gz.read }

  diffs = []
  Yajl::Parser.parse(js) do |event|
    next unless event["type"] == "PushEvent"
    repo_name = event["repo"]["name"]
    event["payload"]["commits"].each do |commit|
      diffs << OpenStruct.new(repo_name: repo_name, sha: commit["sha"])
    end
  end

  # Nothing to fetch; returning here also avoids asking for a progress bar
  # over zero items.
  return if diffs.empty?

  counter = 0
  output_progress(counter, diffs.size)

  EM::Synchrony::FiberIterator.new(diffs, 30).each do |diff|
    repo_url = "https://github.com/#{diff.repo_name}"
    sha = diff.sha
    url = "#{repo_url}/commit/#{sha}.diff"
    begin
      start = Time.now.to_f
      http = EM::HttpRequest.new(url).get
      text = http.response.to_s
      print "read #{url} (#{Time.now.to_f - start} s)\n"
      counter += 1
      output_progress(counter, diffs.size)

      path = nil
      added_lines = []
      text.each_line do |l|
        case l
        when /^\+\+\+ b\/(.*)/
          # A new file header: flush the lines gathered for the previous
          # file before starting to collect for this one.
          match = $~[1]
          process_lines(path, added_lines, diff.repo_name, sha)
          path = match
          added_lines = []
        when /^\+/
          # Added line; drop the leading "+" diff marker.
          added_lines << l[1..-1]
        end
      end
      # Flush the last file of the diff.
      process_lines(path, added_lines, diff.repo_name, sha)
    rescue OpenURI::HTTPError => e
      # NOTE(review): the request goes through EM::HttpRequest, so this
      # open-uri rescue may be a leftover -- confirm before removing.
      print "#{e.message}\n"
    rescue ArgumentError => e
      # Regexp matching raises ArgumentError on text that is not valid in
      # its tagged encoding; such commits are skipped.
      if e.message =~ /invalid byte sequence/
        puts "Invalid byte sequence in UTF-8, skipping..."
      else
        raise
      end
    rescue => e
      # Any other error drops into an interactive pry session for inspection.
      binding.pry
    end
  end
ensure
  puts "Intermediate results:"
  puts $results
end
# Entry point: inside an EM.synchrony reactor, process the hourly dumps for
# January 2015 one file at a time, then stop the reactor.
#
# File names use a zero-padded day and an unpadded hour, e.g.
# "archives/2015-01-01-0.json.gz".
# NOTE(review): the day range stops at "30", so files for 2015-01-31 are
# never read -- confirm whether that is intentional.
EM.synchrony do
  ("01".."30").each do |day|
    (0..23).each do |hour|
      process_file("archives/2015-01-#{day}-#{hour}.json.gz")
    end
  end
  EM.stop
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment