Last active
November 12, 2015 18:44
-
-
Save gnilrets/611d85d5cb87fa31bb8a to your computer and use it in GitHub Desktop.
Daru-CSV-Benchmarks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'bundler/setup' | |
require 'daru' | |
require 'benchmark' | |
require 'yaml' | |
# Path of the tab-separated file to benchmark. Defaults to the original data
# set; pass a different path as the first command-line argument to override.
filename = ARGV.fetch(0, 'Application_1470_20151111010006.txt')
# Runs the given block once, measures its wall-clock duration with
# Benchmark.realtime, and prints "<name>: <seconds>" to stdout.
#
# @param name [#to_s] label printed in front of the elapsed time
# @yield the code to time; it is invoked with no arguments
# @return [nil] the result of +puts+
def bench(name)
  # Plain `yield`: the block does not need to receive itself as an argument.
  elapsed = Benchmark.realtime { yield }
  puts "#{name}: #{elapsed}"
end
# Baseline: time a plain CSV pass over the file, counting rows and doing
# nothing else with them.
bench :read_csv do
  # Quoting not usually needed with tab files, but there are invalid quotes in
  # this file, so set quoting character to \x00.
  count = 0
  # Options are passed as keyword arguments: a positional Hash would be bound
  # to CSV.foreach's `mode` parameter on Ruby 3+.
  CSV.foreach(filename, headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr) do |_row|
    count += 1
  end
end
# Time Daru::DataFrame.from_csv with the same CSV options as the plain read.
bench :vanilla do
  Daru::DataFrame.from_csv(
    filename,
    headers: true,
    col_sep: "\t",
    encoding: "ISO-8859-1:UTF-8",
    quote_char: 0.chr
  )
end
# Time the same from_csv load with Daru.lazy_update switched on: the flag is
# enabled, the frame is loaded, `update` is called on it explicitly, and the
# flag is switched back off before the timing ends.
bench :lazy_update do
  Daru.lazy_update = true
  frame = Daru::DataFrame.from_csv(
    filename,
    headers: true,
    col_sep: "\t",
    encoding: "ISO-8859-1:UTF-8",
    quote_char: 0.chr
  )
  frame.update
  Daru.lazy_update = false
end
# Time Daru::DataFrame.from_csv with `clone: false` added to the options.
bench :clone_false do
  Daru::DataFrame.from_csv(
    filename,
    headers: true,
    col_sep: "\t",
    encoding: "ISO-8859-1:UTF-8",
    quote_char: 0.chr,
    clone: false
  )
end
# Time a hand-rolled load: build a Hash of column arrays from the CSV, then
# hand that Hash to Daru::DataFrame.new. The two phases are timed separately
# by the nested bench calls; the outer bench reports their combined time.
bench :custom_load do
  # Column name (Symbol) => Array of that column's values, in row order.
  to_df = {}

  bench :build_hash_of_array do
    # Options are passed as keyword arguments: a positional Hash would be
    # bound to CSV.foreach's `mode` parameter on Ruby 3+.
    CSV.foreach(filename, headers: true, col_sep: "\t", encoding: "ISO-8859-1:UTF-8", quote_char: 0.chr) do |row|
      row.headers.each do |header|
        # Skip nil headers in one place, so column creation and appending
        # agree (calling to_sym on a nil header would raise).
        next unless header

        # The column array is created the first time its header is seen.
        (to_df[header.to_sym] ||= []) << row[header]
      end
    end
  end

  bench :build_df do
    Daru::DataFrame.new(to_df)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The file has about 4000 records with 150 columns.
# These benchmarks were run on a MacBook Pro (3 GHz i7) using Ruby 2.2.2.
read_csv: 1.061856008003815
vanilla: 7.537773759991978 # So converting the CSV to a DataFrame using Daru::DataFrame.from_csv increases the read time by about 7x!
lazy_update: 12.88438387599308 # lazy_update doesn't help
clone_false: 7.735707809988526 # nor does clone: false
build_hash_of_array: 3.6907865480025066 # building a hash of arrays from the CSV takes about 3.5x as long as just reading the CSV
build_df: 0.6319248990039341 # building a DataFrame from the hash of arrays is super fast
custom_load: 4.322789887010003 # building a hash of arrays from the CSV and creating the DataFrame is about 2x faster than using the .from_csv method
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Don't call `df.update` if `Daru.lazy_update = true`.
Another thing: once you convert to a hash and pass it to DataFrame (in `:build_df`), could you also try passing the `clone: false` option? It should be even faster.
I would like to see the results when this is done.