Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Created December 11, 2013 13:27
Show Gist options
  • Save arunsrinivasan/7910428 to your computer and use it in GitHub Desktop.
Save arunsrinivasan/7910428 to your computer and use it in GitHub Desktop.
Comparison of pandas with data.table joins (along with base:::merge and plyr:::join)
from pandas import *
from pandas.util.testing import rands
import random
N = 10000
ngroups = 10
def get_test_data(ngroups=100, n=N):
unique_groups = range(ngroups)
arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object)
if len(arr) < n:
arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
dtype=object)
random.shuffle(arr)
return arr
from collections import defaultdict
import gc
import time
from pandas.util.testing import rands
N = 10000
indices = np.array([rands(10) for _ in xrange(N)], dtype='O')
indices2 = np.array([rands(10) for _ in xrange(N)], dtype='O')
key = np.tile(indices[:8000], 10)
key2 = np.tile(indices2[:8000], 10)
left = DataFrame({'key': key, 'key2': key2,
'value': np.random.randn(80000)})
right = DataFrame({'key': indices[2000:], 'key2': indices2[2000:],
'value2': np.random.randn(8000)})
right2 = right.append(right, ignore_index=True)
join_methods = ['inner', 'outer', 'left', 'right']
results = DataFrame(index=join_methods, columns=[False, True])
niter = 10
for sort in [False, True]:
for join_method in join_methods:
f = lambda: merge(left, right, how=join_method, sort=sort)
gc.disable()
start = time.time()
for _ in xrange(niter):
f()
elapsed = (time.time() - start) / niter
gc.enable()
results[sort][join_method] = elapsed
# results.columns = ['pandas']
results.columns = ['dont_sort', 'sort']
# R results
from StringIO import StringIO
# many to one
r_results = read_table(StringIO(""" base::merge plyr data.table
inner 0.9587 0.6214 0.0734
outer 1.4238 0.8191 0.2116
left 1.1039 0.6241 0.0528
right 1.1714 0.3678 0.0258
"""), sep='\s+')
presults = results[['dont_sort']].rename(columns={'dont_sort': 'pandas'})
all_results = presults.join(r_results)
all_results = all_results.div(all_results['pandas'], axis=0)
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
'base::merge']]
sort_results = DataFrame.from_items([('pandas', results['sort']),
('R', r_results['base::merge'])])
sort_results['Ratio'] = sort_results['R'] / sort_results['pandas']
nosort_results = DataFrame.from_items([('pandas', results['dont_sort']),
('R', r_results['base::merge'])])
nosort_results['Ratio'] = nosort_results['R'] / nosort_results['pandas']
all_results
# pandas data.table plyr base::merge
# inner 1 0.5662405 4.793758 7.395842
# outer 1 1.593498 6.168404 10.72222
# left 1 0.3982444 4.707279 8.326174
# right 1 0.2065636 2.944732 9.378627
# many to many
from StringIO import StringIO
# many to one
r_results = read_table(StringIO("""base::merge plyr data.table
inner 1.9289 0.6080 0.0917
outer 2.8860 0.8485 0.3490
left 2.1343 0.6279 0.0758
right 2.3700 0.3745 0.0429
"""), sep='\s+')
all_results = presults.join(r_results)
all_results = all_results.div(all_results['pandas'], axis=0)
all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr',
'base::merge']]
all_results
# pandas data.table plyr base::merge
# inner 1 0.7074149 4.690385 14.8804
# outer 1 2.628218 6.389807 21.73363
# left 1 0.5717221 4.735941 16.09797
# right 1 0.343472 2.998375 18.97503
@jangorecki
Copy link

Hi Arun,
There are quite few comparisons of pandas and data.table, at least the recently performed. Could we get an update of the benchmark? if possible also R dplyr and Julia DataFrames. It would help to spread the data.table's excellence. Maybe also pdf version. No rush of course.
Kind Regards

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment