Created
December 11, 2013 13:27
-
-
Save arunsrinivasan/7910428 to your computer and use it in GitHub Desktop.
Comparison of pandas with data.table joins (along with base:::merge and plyr:::join)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import * | |
from pandas.util.testing import rands | |
import random | |
N = 10000 | |
ngroups = 10 | |
def get_test_data(ngroups=100, n=N): | |
unique_groups = range(ngroups) | |
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) | |
if len(arr) < n: | |
arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], | |
dtype=object) | |
random.shuffle(arr) | |
return arr | |
from collections import defaultdict | |
import gc | |
import time | |
from pandas.util.testing import rands | |
N = 10000 | |
indices = np.array([rands(10) for _ in xrange(N)], dtype='O') | |
indices2 = np.array([rands(10) for _ in xrange(N)], dtype='O') | |
key = np.tile(indices[:8000], 10) | |
key2 = np.tile(indices2[:8000], 10) | |
left = DataFrame({'key': key, 'key2': key2, | |
'value': np.random.randn(80000)}) | |
right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:], | |
'value2': np.random.randn(8000)}) | |
right2 = right.append(right, ignore_index=True) | |
join_methods = ['inner', 'outer', 'left', 'right'] | |
results = DataFrame(index=join_methods, columns=[False, True]) | |
niter = 10 | |
for sort in [False, True]: | |
for join_method in join_methods: | |
f = lambda: merge(left, right, how=join_method, sort=sort) | |
gc.disable() | |
start = time.time() | |
for _ in xrange(niter): | |
f() | |
elapsed = (time.time() - start) / niter | |
gc.enable() | |
results[sort][join_method] = elapsed | |
# results.columns = ['pandas'] | |
results.columns = ['dont_sort', 'sort'] | |
# R results | |
from StringIO import StringIO | |
# many to one | |
r_results = read_table(StringIO(""" base::merge plyr data.table | |
inner 0.9587 0.6214 0.0734 | |
outer 1.4238 0.8191 0.2116 | |
left 1.1039 0.6241 0.0528 | |
right 1.1714 0.3678 0.0258 | |
"""), sep='\s+') | |
presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'}) | |
all_results = presults.join(r_results) | |
all_results = all_results.div(all_results['pandas'], axis=0) | |
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', | |
'base::merge']] | |
sort_results = DataFrame.from_items([('pandas', results['sort']), | |
('R', r_results['base::merge'])]) | |
sort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] | |
nosort_results = DataFrame.from_items([('pandas', results['dont_sort']), | |
('R', r_results['base::merge'])]) | |
nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas'] | |
all_results | |
# pandas data.table plyr base::merge | |
# inner 1 0.5662405 4.793758 7.395842 | |
# outer 1 1.593498 6.168404 10.72222 | |
# left 1 0.3982444 4.707279 8.326174 | |
# right 1 0.2065636 2.944732 9.378627 | |
# many to many | |
from StringIO import StringIO | |
# many to one | |
r_results = read_table(StringIO("""base::merge plyr data.table | |
inner 1.9289 0.6080 0.0917 | |
outer 2.8860 0.8485 0.3490 | |
left 2.1343 0.6279 0.0758 | |
right 2.3700 0.3745 0.0429 | |
"""), sep='\s+') | |
all_results = presults.join(r_results) | |
all_results = all_results.div(all_results['pandas'], axis=0) | |
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', | |
'base::merge']] | |
all_results | |
# pandas data.table plyr base::merge | |
# inner 1 0.7074149 4.690385 14.8804 | |
# outer 1 2.628218 6.389807 21.73363 | |
# left 1 0.5717221 4.735941 16.09797 | |
# right 1 0.343472 2.998375 18.97503 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Arun,
There are quite few comparisons of pandas and data.table, at least the recently performed. Could we get an update of the benchmark? if possible also R dplyr and Julia DataFrames. It would help to spread the data.table's excellence. Maybe also pdf version. No rush of course.
Kind Regards