Created
April 10, 2016 08:14
-
-
Save chtz/06debf68c2e8127a30243cb489df6c25 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
#$ time java -Xms2000m -Xmx2000m -jar jruby-complete-1.7.24.jar diff.complex.rb 1_to_40000000.csv 1_to_3000000.x.csv
#
#s0.length = 40000000
#s1.length = 3000001
#s1 in s0 = 3000000
#s1 NOT in s0 = 1
#111111111111111111111111111111222222222222222222222222222222333333333333333333333333333333444444444444444444444444444444555555555555555555555555555555666666666666666666666666666666777777777777777777777777777777888888888888888888888888888888999999999999999999999999999999
#40000001
#
#real 2m39.630s
#user 6m28.787s
#sys 0m3.623s
require 'set'

# Number of counted lines between two progress characters printed to stdout.
PROGRESS_BATCH_SIZE = 100_000

# Once the in-memory set of first-file lines grows past this many entries it
# is compared against the second file and replaced by a fresh, empty set.
MAX_S0_LENGTH = 4_000_000

# Index used to name the next "temp.<n>.csv" scratch file; incremented by
# every call to diff and read again when the scratch files are merged.
@temp_file_i = 0
# Compares every non-empty line of +s1_file+ against one in-memory chunk +s0+.
#
# Lines that are not members of +s0+ are written, one per line, to a new
# scratch file "temp.<n>.csv" in the current directory, where <n> is the value
# of @temp_file_i on entry; the counter is incremented so the next call writes
# to a different file. A "1" is printed to stdout after every
# PROGRESS_BATCH_SIZE non-empty lines.
#
# @param s0 [#include?] collection of chomped lines to test membership against
# @param s1_file [String] path of the file whose lines are looked up in +s0+
# @return [Hash] +:in_s0+ => number of non-empty lines found in +s0+,
#   +:count+ => total number of non-empty lines read from +s1_file+
def diff(s0, s1_file)
  in_s0 = 0
  count = 0
  File.open("temp.#{@temp_file_i}.csv", "w") do |wf|
    @temp_file_i += 1
    File.foreach(s1_file) do |line|
      # Non-destructive chomp: chomp! returns nil when there is nothing to
      # strip (e.g. a final line without a trailing newline), which would
      # make the subsequent method call raise NoMethodError.
      line = line.chomp
      next if line.empty?

      if s0.include?(line)
        in_s0 += 1
      else
        wf.write("#{line}\n")
      end
      count += 1
      print "1" if (count % PROGRESS_BATCH_SIZE).zero?
    end
  end
  { :in_s0 => in_s0, :count => count }
end
# Driver: reads the first file (ARGV[0]) in chunks that fit in memory, runs
# diff against the second file (ARGV[1]) once per chunk, then intersects the
# per-chunk "not found" scratch files to obtain the lines of the second file
# that appear in no chunk of the first.
in_s0 = 0
s1_length = 0
s0 = Set.new
s0_length = 0
count = 0
File.foreach(ARGV[0]) do |line|
  # Non-destructive chomp: chomp! returns nil when the line has no trailing
  # newline (typically the last line), which would raise NoMethodError.
  line = line.chomp
  next if line.empty?

  s0 << line
  count += 1
  print "0" if (count % PROGRESS_BATCH_SIZE).zero?

  # Flush the chunk once it has grown past the configured limit.
  next unless s0.length > MAX_S0_LENGTH

  result = diff(s0, ARGV[1])
  in_s0 += result[:in_s0]
  s1_length = result[:count] # same file every pass, so overwrite rather than add
  s0_length += s0.length
  s0 = Set.new
end

# Compare whatever is left in the last, partially filled chunk.
if s0.length > 0
  result = diff(s0, ARGV[1])
  in_s0 += result[:in_s0]
  s1_length = result[:count]
  s0_length += s0.length
end

print "\n\n"
puts "s0.length = #{s0_length}"
puts "s1.length = #{s1_length}"
puts "s1 in s0 = #{in_s0}"
puts "s1 NOT in s0 = #{s1_length - in_s0}"

# A line is missing from the first file only if it was reported as missing by
# every chunk, hence the intersection over all scratch files.
s1_not_in_s0 = nil
count = 0
(0...@temp_file_i).each do |temp_file_i|
  temp_path = "temp.#{temp_file_i}.csv"
  s1_not_in_s0_candidate = Set.new
  File.foreach(temp_path) do |line|
    line = line.chomp
    s1_not_in_s0_candidate << line unless line.empty?
    count += 1
    print temp_file_i if (count % PROGRESS_BATCH_SIZE).zero?
  end
  File.delete(temp_path)
  s1_not_in_s0 =
    if s1_not_in_s0
      s1_not_in_s0 & s1_not_in_s0_candidate
    else
      s1_not_in_s0_candidate
    end
end

print "\n\n"
# nil.to_a is [], so this is safe even when no scratch file was produced.
puts s1_not_in_s0.to_a
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby
require 'set'

# Simple variant: loads every non-empty line of the first file (ARGV[0]) into
# a Set, then streams the second file (ARGV[1]) and reports which of its
# non-empty lines are, and are not, members of that set.

# Number of counted lines between two progress dots.
progress_batch_size = 100_000

s0 = Set.new
count = 0
File.foreach(ARGV[0]) do |line|
  # Non-destructive chomp: chomp! returns nil when the line has no trailing
  # newline (typically the last line), which would raise NoMethodError.
  line = line.chomp
  next if line.empty?

  s0 << line
  count += 1
  print "." if (count % progress_batch_size).zero?
end
# Terminate the progress line only if at least one dot was printed.
print "\n" if count >= progress_batch_size
puts "s0.length = #{s0.length}"

in_s0 = 0
not_in_s0 = []
count = 0
File.foreach(ARGV[1]) do |line|
  line = line.chomp
  next if line.empty?

  if s0.include?(line)
    in_s0 += 1
  else
    not_in_s0 << line
  end
  count += 1
  print "." if (count % progress_batch_size).zero?
end
print "\n" if count >= progress_batch_size

puts "s1.length = #{in_s0 + not_in_s0.length}"
puts "s1 in s0 = #{in_s0}"
puts "s1 NOT in s0 = #{not_in_s0.length}"
puts not_in_s0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
==== SIMPLE ==== | |
$ time java -jar jruby-complete-1.7.24.jar diff.rb 1_to_3000000.csv 1_to_40000000.csv | |
.............................. | |
s0.length = 3000000 | |
................................................................................................................................................................................................................................................................................................................................................................................................................ | |
s1.length = 40000000 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 37000000 | |
real 0m54.145s | |
user 1m19.255s | |
sys 0m1.578s | |
$ time ./diff.rb 1_to_3000000.csv 1_to_40000000.csv | |
.............................. | |
s0.length = 3000000 | |
................................................................................................................................................................................................................................................................................................................................................................................................................ | |
s1.length = 40000000 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 37000000 | |
real 0m44.482s | |
user 0m43.954s | |
sys 0m0.492s | |
$ time java -jar jruby-complete-1.7.24.jar diff.rb 1_to_40000000.csv 1_to_3000000.csv | |
...............................................................................................................................................................................................................................^C^C^Z | |
$ time java -Xms8000m -Xmx8000m -jar jruby-complete-1.7.24.jar diff.rb 1_to_40000000.csv 1_to_3000000.csv | |
................................................................................................................................................................................................................................................................................................................................................................................................................ | |
s0.length = 40000000 | |
.............................. | |
s1.length = 3000000 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 0 | |
real 2m26.038s | |
user 11m17.241s | |
sys 0m7.339s | |
$ time ./diff.rb 1_to_40000000.csv 1_to_3000000.csv | |
................................................................................................................................................................................................................................................................................................................................................................................................................ | |
s0.length = 40000000 | |
.............................. | |
s1.length = 3000000 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 0 | |
real 1m35.192s | |
user 1m32.760s | |
sys 0m2.407s | |
==== OPTIMIZED ==== | |
$ time java -jar jruby-complete-1.7.24.jar diff.rb 1_to_40000000.csv 1_to_3000000.csv | |
| |
s0.length = 40000000 | |
s1.length = 3000000 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 0 | |
real 1m36.739s | |
user 3m33.497s | |
sys 0m3.297s | |
==== EVEN MORE OPTIMIZED ==== | |
$ time java -Xms2000m -Xmx2000m -jar jruby-complete-1.7.24.jar diff.complex.rb 1_to_40000000.csv 1_to_3000000.x.csv | |
| |
s0.length = 40000000 | |
s1.length = 3000001 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 1 | |
111111111111111111111111111111222222222222222222222222222222333333333333333333333333333333444444444444444444444444444444555555555555555555555555555555666666666666666666666666666666777777777777777777777777777777888888888888888888888888888888999999999999999999999999999999 | |
40000001 | |
real 2m39.630s | |
user 6m28.787s | |
sys 0m3.623s | |
$ time ./diff.complex.rb 1_to_40000000.csv 1_to_3000000.x.csv | |
| |
s0.length = 40000000 | |
s1.length = 3000001 | |
s1 in s0 = 3000000 | |
s1 NOT in s0 = 1 | |
111111111111111111111111111111222222222222222222222222222222333333333333333333333333333333444444444444444444444444444444555555555555555555555555555555666666666666666666666666666666777777777777777777777777777777888888888888888888888888888888999999999999999999999999999999 | |
40000001 | |
real 3m22.208s | |
user 3m11.059s | |
sys 0m10.821s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment