Created
October 4, 2012 04:10
-
-
Save wesm/3831420 to your computer and use it in GitHub Desktop.
Parser shootout
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pylint: disable=W0612 | |
import time | |
import pandas as pd | |
import numpy as np | |
import iopro | |
import gc | |
def _wikipedia_options(adapter):
    """Configure an iopro text adapter for the 'wikipedia' dataset.

    Calls ``adapter.set_field_types`` with a mapping from field index to
    type: fields 0 and 1 are ``object``, field 2 is ``'i4'`` and field 3
    is ``'i8'``.

    Parameters
    ----------
    adapter : object exposing ``set_field_types(mapping)``
        Presumably the adapter returned by ``iopro.text_adapter``.
    """
    adapter.set_field_types({0: object, 1: object, 2: 'i4', 3: 'i8'})
# Optional per-dataset hooks for the iopro benchmark: dataset name -> callable
# that receives the freshly created adapter before any data is read.
_iopro_extras = {
    'wikipedia': _wikipedia_options,
}
# Benchmark datasets: short dataset name -> path handed to each parser.
# The paths are bare file names, so they resolve against the working directory.
_filenames = {
    'zero-matrix': 'zeros.csv',
    'double-matrix': 'matrix.csv',
    'wikipedia': 'pagecounts-20110331-220000',
    'fec': 'P00000001-ALL.csv',
    'astro': 'sdss6949386.csv',
}
# Field-delimiter overrides by dataset name; datasets not listed here are
# read with ',' (see the ``.get(name, ',')`` lookups in the timing functions).
_delimiters = {
    'wikipedia': ' ',
}
def pandas_timings(exclude=(), **kwds):
    """Time ``pd.read_csv`` on every dataset listed in ``_filenames``.

    Parameters
    ----------
    exclude : container of str, optional
        Dataset names to skip.
    **kwds
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    dict
        Maps dataset name to elapsed wall-clock seconds (``time.time()``
        difference) for a single read of that dataset.
    """
    result = {}
    # .items() and print(...) with one argument behave the same on
    # Python 2 and Python 3, unlike the original iteritems()/print statement.
    for name, path in _filenames.items():
        if name in exclude:
            continue

        print(name)
        delim = _delimiters.get(name, ',')

        start = time.time()
        table = pd.read_csv(path, delimiter=delim, **kwds)
        end = time.time()

        result[name] = end - start
        print('%s took %.2f sec' % (name, result[name]))

        # Drop the parsed table and force a collection so it is not kept
        # alive while the next file is parsed.
        table = None
        gc.collect()
    return result
def iopro_timings(exclude=()):
    """Time a full read through ``iopro.text_adapter`` for every dataset.

    The timed region covers creating the adapter, applying the optional
    per-dataset hook from ``_iopro_extras``, and reading the whole array
    with ``adapter[:]``.

    Parameters
    ----------
    exclude : container of str, optional
        Dataset names to skip.

    Returns
    -------
    dict
        Maps dataset name to elapsed wall-clock seconds (``time.time()``
        difference) for a single read of that dataset.
    """
    result = {}
    # .items() and print(...) with one argument behave the same on
    # Python 2 and Python 3, unlike the original iteritems()/print statement.
    for name, path in _filenames.items():
        if name in exclude:
            continue

        print(name)
        delim = _delimiters.get(name, ',')

        start = time.time()
        adapter = iopro.text_adapter(path, delimiter=delim)
        modifier = _iopro_extras.get(name)
        if modifier:
            modifier(adapter)
        # read full array
        table = adapter[:]
        end = time.time()

        result[name] = end - start
        print('%s took %.2f sec' % (name, result[name]))

        # Drop the parsed array and force a collection so it is not kept
        # alive while the next file is parsed.
        table = None
        gc.collect()
    return result
# Hard-coded R timings in seconds, indexed by dataset name. The values equal
# the "user" column of the R system.time() output recorded in the comments
# further down in this file.
r_results = pd.Series({
    'zero-matrix': 0.616,
    'double-matrix': 6.92,
    'astro': 37.03,
    'wikipedia': 42.25,
    'fec': 18.121,
})
# First pass: run both Python benchmarks over every dataset with default
# options and collect them next to the recorded R numbers. This executes at
# module level, so the files are parsed as soon as the script runs
# (iopro first, then pandas).
results = {
    'iopro': iopro_timings(),
    'pandas': pandas_timings(),
    'R': r_results,
}
# R commands and raw system.time() output behind r_results; the "user"
# column is the figure recorded there.
#
# system.time(df <- read.csv('parser_examples/zeros.csv', colClasses=rep("integer", 50)))
#    user  system elapsed
#   0.616   0.004   0.623
#
# system.time(df <- read.csv('parser_examples/matrix.csv', colClasses=rep("numeric", 10)))
#    user  system elapsed
#   6.920   0.136   7.071
#
# system.time(df <- read.csv('parser_examples/sdss6949386.csv', colClasses=rep("numeric", 8)))
#    user  system elapsed
#  37.030   0.804  37.866
#
# system.time(df <- read.table('parser_examples/pagecounts-20110331-220000', sep=" ",
#                              header=F,
#                              colClasses=c("character", "character", "integer", "numeric")))
#    user  system elapsed
#  42.250   0.356  42.651
#
# system.time(df <- read.csv('parser_examples/P00000001-ALL.csv'))
#    user  system elapsed
#  18.121   0.212  18.350
# Turn the collected timings into a table (one row per dataset, one column
# per parser), then divide every row by its pandas timing so each entry
# reads as a multiple of the pandas time.
results = pd.DataFrame(results)
results_norm = results.div(results['pandas'], axis=0)
# Second pass: skip the 'wikipedia' and 'fec' datasets and run pandas with
# na_filter=False and as_recarray=True. This rebinds `results` and
# `results_norm`, replacing the values computed by the first pass.
# NOTE(review): `as_recarray` appears to have been removed from later pandas
# releases -- confirm against the installed pandas version before running.
exclude = ['wikipedia', 'fec']
results = {
    'iopro': iopro_timings(exclude=exclude),
    'pandas': pandas_timings(na_filter=False, as_recarray=True,
                             exclude=exclude),
}
results = pd.DataFrame(results)
results_norm = results.div(results['pandas'], axis=0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment