gnilrets · November 12, 2015 18:44 · v0dro · Nov 12, 2015
diff --git a/daru-csv-benchmarks.rb b/daru-csv-benchmarks.rb
 require 'bundler/setup'

 require 'daru'
 require 'benchmark'
 require 'yaml'

 filename = 'Application_1470_20151111010006.txt'

 # Runs the given block once, prints "<name>: <seconds>" to stdout and
 # returns the measurement.
 #
 # @param name [#to_s] label printed in front of the elapsed time
 # @yield the code to measure; it is invoked with no arguments
 # @return [Float] elapsed real time in seconds, as reported by Benchmark.realtime
 def bench(name, &block)
   # Forward the block as-is. The previous `yield block` handed the block to
   # itself as an argument, which no caller needs.
   elapsed = Benchmark.realtime(&block)
   puts "#{name}: #{elapsed}"
   elapsed
 end


 # Baseline: one pass over the file with the standard CSV reader, only counting rows.
 bench :read_csv do
   # Quoting not usually needed with tab files, but there are invalid quotes in this file, so set quoting character to \x00
   count = 0
   # Options are passed as keyword arguments. On Ruby 3+ a braced hash in this
   # position is an ordinary positional argument and is not treated as options.
   CSV.foreach(filename, headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr) do |_row|
     count += 1
   end
 end

 # Time the stock loader: Daru::DataFrame.from_csv with the tab separator,
 # encoding and NUL quote character used for this file.
 bench(:vanilla) do
   Daru::DataFrame.from_csv(
     filename,
     { headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr }
   )
 end

 # Time from_csv with Daru.lazy_update switched on, followed by an explicit
 # update call on the resulting frame. The flag is toggled inside the timed block.
 bench(:lazy_update) do
   Daru.lazy_update = true
   frame = Daru::DataFrame.from_csv(
     filename,
     { headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr }
   )
   frame.update
   Daru.lazy_update = false
 end

 # Time from_csv with the same reader options plus `clone: false`.
 bench(:clone_false) do
   Daru::DataFrame.from_csv(
     filename,
     { headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr, clone: false }
   )
 end

 # Hand-rolled loader: read the CSV into a hash of column arrays, then build the
 # DataFrame from that hash. The two phases are timed separately and as a whole.
 bench :custom_load do
   # Column name (Symbol) => Array of that column's values, in row order.
   to_df = {}

   bench :build_hash_of_array do
     # Options are passed as keyword arguments. On Ruby 3+ a braced hash in this
     # position is an ordinary positional argument and is not treated as options.
     CSV.foreach(filename, headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr) do |row|
       row.headers.each do |col|
         # Skip nil headers everywhere; the earlier first-row setup called
         # to_sym on them without the guard the append step had.
         next unless col

         # Columns are created on first sight, so key order follows header order.
         (to_df[col.to_sym] ||= []) << row[col]
       end
     end
   end

   bench :build_df do
     Daru::DataFrame.new(to_df)
   end
 end




diff --git a/results b/results
 # The file has about 4000 records with 150 columns
 # These benchmarks run on a MacBook Pro 3GHZ i7 using Ruby 2.2.2

 read_csv: 1.061856008003815            
 vanilla: 7.537773759991978              # So converting the csv to a DF using Daru::DataFrame.from_csv increases the read time by 7x!
 lazy_update: 12.88438387599308          # lazy_update doesn't help
 clone_false: 7.735707809988526          # nor does clone
 build_hash_of_array: 3.6907865480025066 # building a hash of arrays from the CSV takes about 3.5x as long as just reading the CSV
 build_df: 0.6319248990039341            # building a df from the hash of arrays is super fast
 custom_load: 4.322789887010003          # building a hash of arrays from the CSV and creating the DF is about 2x faster than using the .from_csv method
	require 'bundler/setup'

	require 'daru'
	require 'benchmark'
	require 'yaml'

	filename = 'Application_1470_20151111010006.txt'

	# Runs the given block once, prints "<name>: <seconds>" to stdout and
	# returns the measurement.
	#
	# @param name [#to_s] label printed in front of the elapsed time
	# @yield the code to measure; it is invoked with no arguments
	# @return [Float] elapsed real time in seconds, as reported by Benchmark.realtime
	def bench(name, &block)
	  # Forward the block as-is. The previous `yield block` handed the block to
	  # itself as an argument, which no caller needs.
	  elapsed = Benchmark.realtime(&block)
	  puts "#{name}: #{elapsed}"
	  elapsed
	end


	# Baseline: one pass over the file with the standard CSV reader, only counting rows.
	bench :read_csv do
	  # Quoting not usually needed with tab files, but there are invalid quotes in this file, so set quoting character to \x00
	  count = 0
	  # Block pipes restored (they had been escaped as `\|`, which is not valid Ruby).
	  # Options are passed as keyword arguments. On Ruby 3+ a braced hash in this
	  # position is an ordinary positional argument and is not treated as options.
	  CSV.foreach(filename, headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr) do |_row|
	    count += 1
	  end
	end

	# Time the stock loader: Daru::DataFrame.from_csv with the tab separator,
	# encoding and NUL quote character used for this file.
	bench(:vanilla) do
	  Daru::DataFrame.from_csv(
	    filename,
	    { headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr }
	  )
	end

	# Time from_csv with Daru.lazy_update switched on, followed by an explicit
	# update call on the resulting frame. The flag is toggled inside the timed block.
	bench(:lazy_update) do
	  Daru.lazy_update = true
	  frame = Daru::DataFrame.from_csv(
	    filename,
	    { headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr }
	  )
	  frame.update
	  Daru.lazy_update = false
	end

	# Time from_csv with the same reader options plus `clone: false`.
	bench(:clone_false) do
	  Daru::DataFrame.from_csv(
	    filename,
	    { headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr, clone: false }
	  )
	end

	# Hand-rolled loader: read the CSV into a hash of column arrays, then build the
	# DataFrame from that hash. The two phases are timed separately and as a whole.
	bench :custom_load do
	  # Column name (Symbol) => Array of that column's values, in row order.
	  to_df = {}

	  bench :build_hash_of_array do
	    # Block pipes restored (they had been escaped as `\|`, which is not valid Ruby).
	    # Options are passed as keyword arguments. On Ruby 3+ a braced hash in this
	    # position is an ordinary positional argument and is not treated as options.
	    CSV.foreach(filename, headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr) do |row|
	      row.headers.each do |col|
	        # Skip nil headers everywhere; the earlier first-row setup called
	        # to_sym on them without the guard the append step had.
	        next unless col

	        # Columns are created on first sight, so key order follows header order.
	        (to_df[col.to_sym] ||= []) << row[col]
	      end
	    end
	  end

	  bench :build_df do
	    Daru::DataFrame.new(to_df)
	  end
	end
	# The file has about 4000 records with 150 columns
	# These benchmarks run on a MacBook Pro 3GHZ i7 using Ruby 2.2.2

	read_csv: 1.061856008003815
	vanilla: 7.537773759991978 # So converting the csv to a DF using Daru::DataFrame.from_csv increases the read time by 7x!
	lazy_update: 12.88438387599308 # lazy_update doesn't help
	clone_false: 7.735707809988526 # nor does clone
	build_hash_of_array: 3.6907865480025066 # building a hash of arrays from the CSV takes about 3.5x as long as just reading the CSV
	build_df: 0.6319248990039341 # building a df from the hash of arrays is super fast
	custom_load: 4.322789887010003 # building a hash of arrays from the CSV and creating the DF is about 2x faster than using the .from_csv method