Created
November 22, 2016 07:58
-
-
Save DonerKebab/31134610a1f6ba5bbc7093026d13a732 to your computer and use it in GitHub Desktop.
Fastest Python library to read a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cProfile
import csv
import os
import shutil
import tempfile
import time
import warnings

import numpy
import pandas
# Make sure these files are in the same folder as benchmark_python.py
# (the paths below are relative, so run the script from that folder).
# As the names indicate:
# - '1col.csv' is a CSV file with 1 column
# - '3col.csv' is a CSV file with 3 columns
filename1 = '1col.csv'
filename3 = '3col.csv'
# Field separator passed to every reader function in this file.
csv_delimiter = ' '
# When True, the benchmarks also print samples of the data they load.
debug = False
def open_with_python_csv(filename, delimiter=None):
    '''
    Read a delimited text file row by row with the standard csv module.

    https://docs.python.org/3/library/csv.html

    filename:  path of the file to read.
    delimiter: one-character field separator; when omitted, the
               module-level csv_delimiter is used.

    Returns a list with one entry per row, each row being the list of
    field strings yielded by csv.reader ('|' is the quote character).
    '''
    if delimiter is None:
        delimiter = csv_delimiter
    data = []
    # csv.reader needs a text-mode file on Python 3: a file opened with 'rb'
    # makes it raise "iterator should return strings, not bytes".
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter, quotechar='|')
        # The explicit append loop is deliberate: it is the variant this
        # function benchmarks, as opposed to calling list() on the reader.
        for row in csvreader:
            data.append(row)
    return data
def open_with_python_csv_cast_as_float(filename, delimiter=None):
    '''
    Read a delimited text file with the csv module, converting every field
    to float.

    https://docs.python.org/3/library/csv.html

    filename:  path of the file to read.
    delimiter: one-character field separator; when omitted, the
               module-level csv_delimiter is used.

    Returns a list of rows, each row being a list of floats.
    Raises ValueError if a field cannot be parsed by float().
    '''
    if delimiter is None:
        delimiter = csv_delimiter
    data = []
    # Text mode with newline='' is what csv.reader requires on Python 3;
    # a file opened with 'rb' makes the reader raise.
    with open(filename, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter, quotechar='|')
        for row in csvreader:
            # map() is lazy on Python 3; materialise it so that each row is
            # really converted (and stored) as a list of floats.
            data.append(list(map(float, row)))
    return data
def open_with_python_csv_list(filename, delimiter=None):
    '''
    Read a delimited text file with the csv module, materialising the
    reader with a single list() call instead of an append loop.

    https://docs.python.org/3/library/csv.html

    filename:  path of the file to read.
    delimiter: one-character field separator; when omitted, the
               module-level csv_delimiter is used.

    Returns a list of rows, each row being a list of field strings.
    '''
    if delimiter is None:
        delimiter = csv_delimiter
    # Text mode with newline='' is what csv.reader requires on Python 3;
    # a file opened with 'rb' makes the reader raise.
    with open(filename, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=delimiter, quotechar='|')
        data = list(csvreader)
    return data
def open_with_numpy_loadtxt(filename, delimiter=None):
    '''
    Read a delimited numeric text file with numpy.loadtxt.

    http://stackoverflow.com/questions/4315506/load-csv-into-2d-matrix-with-numpy-for-plotting

    filename:  path of the file to read.
    delimiter: field separator; when omitted, the module-level
               csv_delimiter is used.

    Returns the numpy array built by loadtxt (no rows are skipped).
    '''
    if delimiter is None:
        delimiter = csv_delimiter
    # Hand loadtxt the path rather than an open() handle: the handle was
    # never closed, whereas loadtxt closes whatever it opens itself.
    data = numpy.loadtxt(filename, delimiter=delimiter, skiprows=0)
    return data
def open_with_pandas_read_csv(filename, delimiter=None):
    '''
    Read a delimited text file with pandas.read_csv.

    filename:  path of the file to read.
    delimiter: field separator; when omitted, the module-level
               csv_delimiter is used.

    Returns the values of the resulting DataFrame as a numpy array with
    one row per line of the file.
    '''
    if delimiter is None:
        delimiter = csv_delimiter
    # header=None: by default read_csv takes the first line as column names,
    # which silently dropped the first data row and made this reader return
    # one row fewer than the other readers for the same file.
    df = pandas.read_csv(filename, sep=delimiter, header=None)
    data = df.values
    return data
def benchmark(function_name):
    '''
    Time one reader function over both benchmark files.

    function_name: despite its name, the reader function object itself
                   (not a string); it is called with a file path and must
                   return something indexable.

    Prints "<reader name>: <elapsed> seconds" and returns the elapsed time
    in seconds as a float, so callers can aggregate several runs.
    '''
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring short intervals.
    start_time = time.perf_counter()
    for filename in (filename1, filename3):
        data = function_name(filename)
        if debug:
            print(data[0])
    elapsed = time.perf_counter() - start_time
    print(function_name.__name__ + ': ' + str(elapsed) + ' seconds')
    return elapsed
def benchmark_numpy_fromfile():
    '''
    Time numpy.fromfile on raw binary dumps of the two benchmark files.

    Both CSV files are first parsed with open_with_numpy_loadtxt and dumped
    with ndarray.tofile into a temporary directory; only reading the dumps
    back (plus reshaping the second one) is timed.

    http://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html
    Do not rely on the combination of tofile and fromfile for data storage,
    as the binary files generated are not platform independent.
    In particular, no byte-order or data-type information is saved.
    Data can be stored in the platform independent .npy format using
    save and load instead.
    Note that fromfile will create a one-dimensional array containing your
    data, so you might need to reshape it afterward.

    Prints "Numpy fromfile: <elapsed> seconds" and returns the elapsed time
    in seconds as a float.
    '''
    # Parsing the text files is setup, not part of the measurement.
    data1 = open_with_numpy_loadtxt(filename1)
    if debug:
        print(data1[0])
    data3 = open_with_numpy_loadtxt(filename3)
    if debug:
        print(data3[0])
        print(data3.shape)
    fname3shape = data3.shape

    # os.tmpnam() is insecure and no longer exists on Python 3; a private
    # temporary directory also lets the dumps be removed afterwards.
    tmpdir = tempfile.mkdtemp()
    try:
        fname1 = os.path.join(tmpdir, '1col.bin')
        fname3 = os.path.join(tmpdir, '3col.bin')
        data1.tofile(fname1)
        data3.tofile(fname3)

        start_time = time.perf_counter()
        # You might need to switch to float32. List of types:
        # http://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
        data = numpy.fromfile(fname1, dtype=numpy.float64)
        if debug:
            print(len(data), data[0], data.shape)
        data = numpy.fromfile(fname3, dtype=numpy.float64)
        # fromfile always yields a flat array; restore the original shape.
        data = data.reshape(fname3shape)
        if debug:
            print(len(data), data[0], data.shape)
        elapsed = time.perf_counter() - start_time
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    print('Numpy fromfile: ' + str(elapsed) + ' seconds')
    return elapsed
def benchmark_numpy_save_load():
    '''
    Time numpy.load on .npy copies of the two benchmark files.

    Both CSV files are first parsed with open_with_numpy_loadtxt and written
    with numpy.save into a temporary directory; only loading the .npy files
    back is timed. Unlike raw tofile/fromfile dumps, the .npy format stores
    shape and dtype, so no reshape is needed after loading.

    http://docs.scipy.org/doc/numpy/reference/generated/numpy.load.html

    Prints "Numpy load: <elapsed> seconds" and returns the elapsed time in
    seconds as a float.
    '''
    # Parsing the text files is setup, not part of the measurement.
    data1 = open_with_numpy_loadtxt(filename1)
    if debug:
        print(data1[0])
    data3 = open_with_numpy_loadtxt(filename3)
    if debug:
        print(data3[0])
        print(data3.shape)

    # os.tmpnam() is insecure and no longer exists on Python 3; a private
    # temporary directory also lets the .npy files be removed afterwards.
    tmpdir = tempfile.mkdtemp()
    try:
        # Explicit .npy suffix, so numpy.save writes exactly these paths.
        fname1 = os.path.join(tmpdir, '1col.npy')
        fname3 = os.path.join(tmpdir, '3col.npy')
        numpy.save(fname1, data1)
        numpy.save(fname3, data3)

        start_time = time.perf_counter()
        data = numpy.load(fname1)
        if debug:
            print(len(data), data[0], data.shape)
        data = numpy.load(fname3)
        if debug:
            print(len(data), data[0], data.shape)
        elapsed = time.perf_counter() - start_time
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    print('Numpy load: ' + str(elapsed) + ' seconds')
    return elapsed
def main():
    '''
    Run every benchmark 20 times and print the collected results.

    Prints the raw table of return values (one row per run, one column per
    benchmark), then the per-benchmark mean and standard deviation across
    runs. The statistics are only meaningful when each benchmark callable
    returns its elapsed time as a number.
    '''
    number_of_runs = 20
    # (callable, positional arguments) pairs, invoked directly rather than
    # being spelled as source strings and passed to eval().
    benchmark_functions = [
        (benchmark, (open_with_python_csv,)),
        (benchmark, (open_with_python_csv_list,)),
        (benchmark, (open_with_python_csv_cast_as_float,)),
        (benchmark, (open_with_numpy_loadtxt,)),
        (benchmark, (open_with_pandas_read_csv,)),
        (benchmark_numpy_fromfile, ()),
        (benchmark_numpy_save_load, ()),
    ]

    # Compute benchmark
    results = []
    for _ in range(number_of_runs):
        run_results = []
        for function, arguments in benchmark_functions:
            run_results.append(function(*arguments))
        results.append(run_results)

    # Display benchmark's results
    print(results)
    results = numpy.array(results)
    # http://stackoverflow.com/questions/2891790/pretty-printing-of-numpy-array
    numpy.set_printoptions(precision=10)
    # suppress=True avoids scientific notation for small numbers.
    numpy.set_printoptions(suppress=True)
    # axis=0 aggregates over runs, giving one value per benchmark.
    print(numpy.mean(results, axis=0))
    print(numpy.std(results, axis=0))
# Another library, but not free: https://store.continuum.io/cshop/iopro/
if __name__ == "__main__":
    # To profile the run, call cProfile.run('main()') here instead of main().
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment