Skip to content

Instantly share code, notes, and snippets.

@chtz
Created April 10, 2016 08:14
Show Gist options
  • Save chtz/06debf68c2e8127a30243cb489df6c25 to your computer and use it in GitHub Desktop.
Save chtz/06debf68c2e8127a30243cb489df6c25 to your computer and use it in GitHub Desktop.
#!/usr/bin/ruby
#$ time java -Xms2000m -Xmx2000m -jar jruby-complete-1.7.24.jar diff.complex.rb 1_to_40000000.csv 1_to_3000000.x.csv
#000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000#000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000#1111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111
#s0.length = 40000000
#s1.length = 3000001
#s1 in s0 = 3000000
#s1 NOT in s0 = 1
#111111111111111111111111111111222222222222222222222222222222333333333333333333333333333333444444444444444444444444444444555555555555555555555555555555666666666666666666666666666666777777777777777777777777777777888888888888888888888888888888999999999999999999999999999999
#40000001
#
#real 2m39.630s
#user 6m28.787s
#sys 0m3.623s
require 'set'
# Number of processed lines between two progress characters printed to stdout.
PROGRESS_BATCH_SIZE = 100000
# Chunk-size threshold: once the in-memory Set built from the first input file
# holds more than this many lines, it is diffed against the second file and
# replaced by an empty Set.
MAX_S0_LENGTH=4000000
# Index of the next "temp.<n>.csv" file; diff increments it on every call.
@temp_file_i = 0
# Compares one in-memory chunk of the first file against the second file.
#
# Streams +s1_file+ line by line, counts the non-blank lines that are members
# of +s0+, and writes every non-blank line that is NOT in +s0+ to a fresh
# "temp.<n>.csv" file in the current directory. <n> is taken from
# @temp_file_i, which is incremented so that each call gets its own file.
# Prints "1" every PROGRESS_BATCH_SIZE non-blank lines as a progress indicator.
#
# @param s0 [#include?] collection of chomped lines (a Set in this script)
# @param s1_file [String] path of the file to compare against +s0+
# @return [Hash] +:in_s0+ => number of non-blank lines found in +s0+,
#   +:count+ => total number of non-blank lines read from +s1_file+
def diff(s0, s1_file)
  in_s0 = 0
  count = 0
  File.open("temp.#{@temp_file_i}.csv", "w") do |wf|
    @temp_file_i += 1
    # Read the file that was passed in; previously the parameter was ignored
    # and ARGV[1] was always opened.
    File.open(s1_file) do |f|
      f.each_line do |line|
        # chomp! returns nil when there is nothing to strip (e.g. a final line
        # without a trailing newline), so never chain on its return value.
        line.chomp!
        next if line.empty?

        if s0.include?(line)
          in_s0 += 1
        else
          wf.write("#{line}\n")
        end
        count += 1
        print "1" if count % PROGRESS_BATCH_SIZE == 0
      end
    end
  end
  { :in_s0 => in_s0, :count => count }
end
# Phase 1: read the first file (ARGV[0]) in chunks and diff each chunk
# against the second file (ARGV[1]).
#
# Non-blank lines are collected into the Set s0. Whenever s0 holds more than
# MAX_S0_LENGTH entries it is handed to diff and replaced with a fresh Set.
# A final partial chunk is diffed after the file has been read completely.
in_s0 = 0      # running total of second-file lines found in some chunk
s1_length = 0  # non-blank line count of the second file (from the latest pass)
s0 = Set.new
s0_length = 0  # sum of the sizes of all chunks processed so far
count = 0
File.open(ARGV[0]) do |f|
  f.each_line do |line|
    # chomp! returns nil when nothing was stripped (e.g. the last line has no
    # trailing newline), so its return value must not be chained on.
    line.chomp!
    next if line.empty?

    s0 << line
    count += 1
    print "0" if count % PROGRESS_BATCH_SIZE == 0
    next unless s0.length > MAX_S0_LENGTH

    result = diff(s0, ARGV[1])
    in_s0 += result[:in_s0]
    s1_length = result[:count]
    s0_length += s0.length
    s0 = Set.new
  end
end
# Diff whatever is left over after the last full chunk.
unless s0.empty?
  result = diff(s0, ARGV[1])
  in_s0 += result[:in_s0]
  s1_length = result[:count]
  s0_length += s0.length
end
print "\n\n"
puts "s0.length = #{s0_length}"
puts "s1.length = #{s1_length}"
puts "s1 in s0 = #{in_s0}"
puts "s1 NOT in s0 = #{s1_length - in_s0}"
# Phase 2: combine the per-chunk temp files written by diff.
#
# Each "temp.<n>.csv" lists the second-file lines that were missing from one
# chunk of the first file. A line is missing from the whole first file only if
# it appears in every temp file, so the result is the intersection of all of
# them. Each temp file is deleted once it has been read.
s1_not_in_s0 = nil
count = 0
@temp_file_i.times do |idx|
  path = "temp.#{idx}.csv"
  candidate = Set.new
  File.open(path) do |io|
    io.each_line do |row|
      candidate << row unless row.chomp!.empty?
      count += 1
      # Progress indicator: the index of the temp file currently being read.
      print idx if (count % PROGRESS_BATCH_SIZE).zero?
    end
  end
  File.delete(path)
  # The first file seeds the result; later files narrow it down.
  s1_not_in_s0 = s1_not_in_s0.nil? ? candidate : s1_not_in_s0 & candidate
end
print "\n\n"
puts s1_not_in_s0.to_a
#!/usr/bin/ruby
require 'set'
# Load every non-blank line of the first file (ARGV[0]) into the Set s0,
# printing a "." every 100000 lines as a progress indicator.
s0 = Set.new
count = 0
File.open(ARGV[0]) do |f|
  f.each_line do |line|
    # chomp! returns nil when nothing was stripped (e.g. the last line has no
    # trailing newline), so its return value must not be chained on.
    line.chomp!
    next if line.empty?

    s0 << line
    count += 1
    print "." if count % 100000 == 0
  end
end
# Terminate the progress line only if at least one "." was printed.
print "\n" if count >= 100000
puts "s0.length = #{s0.length}"
# Stream the second file (ARGV[1]) and classify each non-blank line as either
# present in s0 (counted) or absent from it (collected in not_in_s0), printing
# a "." every 100000 lines as a progress indicator. Finally print the summary
# followed by all lines that were not found in s0.
in_s0 = 0
not_in_s0 = []
count = 0
File.open(ARGV[1]) do |f|
  f.each_line do |line|
    # chomp! returns nil when nothing was stripped (e.g. the last line has no
    # trailing newline), so its return value must not be chained on.
    line.chomp!
    next if line.empty?

    if s0.include?(line)
      in_s0 += 1
    else
      not_in_s0 << line
    end
    count += 1
    print "." if count % 100000 == 0
  end
end
# Terminate the progress line only if at least one "." was printed.
print "\n" if count >= 100000
puts "s1.length = #{in_s0 + not_in_s0.length}"
puts "s1 in s0 = #{in_s0}"
puts "s1 NOT in s0 = #{not_in_s0.length}"
puts not_in_s0
==== SIMPLE ====
$ time java -jar jruby-complete-1.7.24.jar diff.rb 1_to_3000000.csv 1_to_40000000.csv
..............................
s0.length = 3000000
................................................................................................................................................................................................................................................................................................................................................................................................................
s1.length = 40000000
s1 in s0 = 3000000
s1 NOT in s0 = 37000000
real 0m54.145s
user 1m19.255s
sys 0m1.578s
$ time ./diff.rb 1_to_3000000.csv 1_to_40000000.csv
..............................
s0.length = 3000000
................................................................................................................................................................................................................................................................................................................................................................................................................
s1.length = 40000000
s1 in s0 = 3000000
s1 NOT in s0 = 37000000
real 0m44.482s
user 0m43.954s
sys 0m0.492s
$ time java -jar jruby-complete-1.7.24.jar diff.rb 1_to_40000000.csv 1_to_3000000.csv
...............................................................................................................................................................................................................................^C^C^Z
$ time java -Xms8000m -Xmx8000m -jar jruby-complete-1.7.24.jar diff.rb 1_to_40000000.csv 1_to_3000000.csv
................................................................................................................................................................................................................................................................................................................................................................................................................
s0.length = 40000000
..............................
s1.length = 3000000
s1 in s0 = 3000000
s1 NOT in s0 = 0
real 2m26.038s
user 11m17.241s
sys 0m7.339s
$ time ./diff.rb 1_to_40000000.csv 1_to_3000000.csv
................................................................................................................................................................................................................................................................................................................................................................................................................
s0.length = 40000000
..............................
s1.length = 3000000
s1 in s0 = 3000000
s1 NOT in s0 = 0
real 1m35.192s
user 1m32.760s
sys 0m2.407s
==== OPTIMIZED ====
$ time java -jar jruby-complete-1.7.24.jar diff.rb 1_to_40000000.csv 1_to_3000000.csv
0000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111
s0.length = 40000000
s1.length = 3000000
s1 in s0 = 3000000
s1 NOT in s0 = 0
real 1m36.739s
user 3m33.497s
sys 0m3.297s
==== EVEN MORE OPTIMIZED ====
$ time java -Xms2000m -Xmx2000m -jar jruby-complete-1.7.24.jar diff.complex.rb 1_to_40000000.csv 1_to_3000000.x.csv
0000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111
s0.length = 40000000
s1.length = 3000001
s1 in s0 = 3000000
s1 NOT in s0 = 1
111111111111111111111111111111222222222222222222222222222222333333333333333333333333333333444444444444444444444444444444555555555555555555555555555555666666666666666666666666666666777777777777777777777777777777888888888888888888888888888888999999999999999999999999999999
40000001
real 2m39.630s
user 6m28.787s
sys 0m3.623s
$ time ./diff.complex.rb 1_to_40000000.csv 1_to_3000000.x.csv
0000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111000000000000000000000000000000000000000011111111111111111111111111111100000000000000000000000000000000000000001111111111111111111111111111110000000000000000000000000000000000000000111111111111111111111111111111
s0.length = 40000000
s1.length = 3000001
s1 in s0 = 3000000
s1 NOT in s0 = 1
111111111111111111111111111111222222222222222222222222222222333333333333333333333333333333444444444444444444444444444444555555555555555555555555555555666666666666666666666666666666777777777777777777777777777777888888888888888888888888888888999999999999999999999999999999
40000001
real 3m22.208s
user 3m11.059s
sys 0m10.821s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment