Last active
October 12, 2015 21:28
-
-
Save pprett/4089926 to your computer and use it in GitHub Desktop.
Benchmark script to bench scikit-learn's RandomForestClassifier against R's randomForest.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Benchmark script to bench scikit-learn's RandomForestClassifier | |
vs. R's randomForest. | |
It uses rpy2 to call R from python. Timings for randomForest are | |
pessimistic due to a constant overhead by wrapping numpy matrices | |
in R data_frames. The effect of the overhead can be reduced | |
by increasing the number of trees. | |
Note: make sure the LD_LIBRARY_PATH is set for rpy2:: | |
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib | |
""" | |
import os
from collections import defaultdict
from functools import wraps
from time import time

import numpy as np
import pylab as pl
from rpy2 import robjects as ro
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
from sklearn import datasets as sk_datasets
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import check_random_state
from sklearn.utils import shuffle
# Handle to R's ``randomForest`` package, loaded through rpy2.
rf = importr('randomForest')

# Root directory that holds the on-disk corpora read by the spam, madelon
# and arcene benchmarks. Set the RF_BENCH_DATA_PATH environment variable to
# point somewhere else; when it is unset the original hard-coded location
# is used, so existing setups keep working.
data_path = os.environ.get('RF_BENCH_DATA_PATH', '/home/pprett/corpora')
class RRandomForestClassifier(BaseEstimator, ClassifierMixin):
    """Classifier that delegates training and prediction to R's randomForest.

    All keyword arguments given to the constructor are stored untouched in
    ``self.params`` and forwarded to ``rf.randomForest`` when :meth:`fit`
    is called.
    """

    def __init__(self, **kargs):
        self.params = kargs

    def fit(self, X, y):
        """Train the R model on ``X`` / ``y`` and return ``self``.

        The distinct labels are remembered in ``classes_``; the target is
        handed to R as a factor of 1-based positions into that array.
        """
        labels = np.unique(y)
        self.classes_ = labels
        # searchsorted yields 0-based positions in the sorted label array;
        # shift by one before building the R factor.
        codes = np.searchsorted(labels, y) + 1
        r_features = numpy2ri(X)
        r_target = ro.FactorVector(numpy2ri(codes))
        self.model_ = rf.randomForest(r_features, r_target, **self.params)
        return self

    def predict(self, X):
        """Predict labels for ``X`` using the model stored by :meth:`fit`."""
        r_codes = rf.predict_randomForest(self.model_, numpy2ri(X))
        # Undo the +1 shift applied in fit to get indices into classes_.
        positions = np.array(r_codes, dtype=np.int32) - 1
        return self.classes_[positions]
def repeat(n_repetitions=3):
    """Build a decorator that runs a benchmark repeatedly and aggregates.

    The decorated function is called ``n_repetitions`` times; call ``i``
    receives ``random_state=i`` (``0 .. n_repetitions - 1``) in addition to
    the caller's own arguments. The per-run return values are stacked into
    an array and reduced along axis 0.

    Parameters
    ----------
    n_repetitions : int, default 3
        Number of times the decorated function is invoked.

    Returns
    -------
    callable
        A decorator. The decorated function returns the tuple
        ``(mean, std)`` of the stacked per-run results.

    Raises
    ------
    ValueError
        If ``n_repetitions`` is smaller than 1. Aggregating zero runs would
        otherwise silently yield NaN results.
    """
    if n_repetitions < 1:
        raise ValueError(
            "n_repetitions must be >= 1, got %r" % (n_repetitions,))

    def wrap(f):
        @wraps(f)
        def wrapper(*args, **kargs):
            # The repetition index doubles as the seed of each run.
            scores = np.array([f(*args, random_state=seed, **kargs)
                               for seed in range(n_repetitions)])
            return scores.mean(axis=0), scores.std(axis=0)
        return wrapper
    return wrap
@repeat()
def bench_hastie_10_2(clf, random_state=None):
    """Benchmark ``clf`` on data generated by ``make_hastie_10_2``.

    The first 2000 generated samples are used for training, the remaining
    ones for testing. A single run yields
    ``(error_rate, train_time, test_time)``; the ``repeat`` decorator
    aggregates several runs into ``(mean, std)``.
    """
    n_train = 2000
    data, target = sk_datasets.make_hastie_10_2(random_state=random_state)
    # Training features are converted to Fortran-ordered float32.
    X_fit = np.asarray(data[:n_train], order='f', dtype=np.float32)
    X_eval = np.asarray(data[n_train:], dtype=np.float32)
    y_fit, y_eval = target[:n_train], target[n_train:]

    start = time()
    clf.fit(X_fit, y_fit)
    train_time = time() - start

    start = time()
    predictions = clf.predict(X_eval)
    test_time = time() - start

    # Fraction of test samples that were predicted incorrectly.
    return np.mean(predictions != y_eval), train_time, test_time
@repeat()
def bench_random_gaussian(clf, random_state=None):
    """Benchmark ``clf`` on a synthetic Gaussian problem.

    Draws 12000 samples with 10 normally distributed features each; a
    sample is labelled 1 when the sum of its squared features exceeds 9.34
    and 0 otherwise. The first 2000 samples are used for training, the
    rest for testing. A single run yields
    ``(error_rate, train_time, test_time)``; the ``repeat`` decorator
    aggregates several runs into ``(mean, std)``.
    """
    n_train = 2000
    rng = check_random_state(random_state)
    features = rng.normal(size=(12000, 10))
    squared_norm = (features ** 2.0).sum(axis=1)
    target = (squared_norm > 9.34).astype(np.int32)

    # Training features are converted to Fortran-ordered float32.
    X_fit = np.asarray(features[:n_train], order='f', dtype=np.float32)
    X_eval = np.asarray(features[n_train:], dtype=np.float32)
    y_fit, y_eval = target[:n_train], target[n_train:]

    start = time()
    clf.fit(X_fit, y_fit)
    train_time = time() - start

    start = time()
    predictions = clf.predict(X_eval)
    test_time = time() - start

    # Fraction of test samples that were predicted incorrectly.
    return np.mean(predictions != y_eval), train_time, test_time
@repeat()
def bench_spam(clf, random_state=None):
    """Benchmark ``clf`` on the spambase data stored under ``data_path``.

    ``spam/spambase.data`` is read as comma-separated values; the last
    column is the target and the remaining columns are the features. The
    rows are shuffled with ``random_state``; the first 1536 shuffled rows
    form the test set and the remainder the training set.

    A single run yields ``(error_rate, train_time, test_time)``, where
    ``error_rate`` is ``1 - clf.score(...)`` on the test set and
    ``test_time`` is the duration of that ``score`` call. The ``repeat``
    decorator aggregates several runs into ``(mean, std)``.
    """
    # The previous version also opened spam/spambase.names to build a
    # feature-name array that was never used, and never closed the file.
    # That read is dropped here.
    n_test = 1536
    table = np.loadtxt(data_path + "/spam/spambase.data", delimiter=",")
    y = table[:, -1].ravel()
    X = table[:, :-1]
    X, y = shuffle(X, y, random_state=random_state)

    X_test, y_test = X[:n_test], y[:n_test]
    X_train, y_train = X[n_test:], y[n_test:]
    # Training features are converted to Fortran-ordered float32.
    X_train = np.asarray(X_train, order='f', dtype=np.float32)
    X_test = np.asarray(X_test, dtype=np.float32)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    error_rate = 1.0 - clf.score(X_test, y_test)
    test_time = time() - t0

    return error_rate, train_time, test_time
@repeat()
def bench_madelon(clf, random_state=None):
    """Benchmark ``clf`` on the madelon files stored under ``data_path``.

    Trains on ``madelon_train.*`` and evaluates on ``madelon_valid.*``.
    ``random_state`` is accepted because the ``repeat`` decorator supplies
    it, but it is not used in this function. A single run yields
    ``(error_rate, train_time, test_time)``, where ``error_rate`` is
    ``1 - clf.score(...)`` on the validation files; ``repeat`` aggregates
    several runs into ``(mean, std)``.
    """
    # Training features are converted to Fortran-ordered float32.
    X_fit = np.asarray(
        np.loadtxt(data_path + "/madelon/madelon_train.data"),
        order='f', dtype=np.float32)
    y_fit = np.loadtxt(data_path + "/madelon/madelon_train.labels")
    X_eval = np.asarray(
        np.loadtxt(data_path + "/madelon/madelon_valid.data"),
        dtype=np.float32)
    y_eval = np.loadtxt(data_path + "/madelon/madelon_valid.labels")

    start = time()
    clf.fit(X_fit, y_fit)
    train_time = time() - start

    start = time()
    error_rate = 1.0 - clf.score(X_eval, y_eval)
    test_time = time() - start

    return error_rate, train_time, test_time
@repeat()
def bench_arcene(clf, random_state=None):
    """Benchmark ``clf`` on the arcene files stored under ``data_path``.

    Trains on ``arcene_train.*`` and evaluates on ``arcene_valid.*``.
    ``random_state`` is accepted because the ``repeat`` decorator supplies
    it, but it is not used in this function. A single run yields
    ``(error_rate, train_time, test_time)``, where ``error_rate`` is
    ``1 - clf.score(...)`` on the validation files; ``repeat`` aggregates
    several runs into ``(mean, std)``.
    """
    # Training features are converted to Fortran-ordered float32.
    X_fit = np.asarray(
        np.loadtxt(data_path + "/arcene/arcene_train.data"),
        order='f', dtype=np.float32)
    y_fit = np.loadtxt(data_path + "/arcene/arcene_train.labels")
    X_eval = np.asarray(
        np.loadtxt(data_path + "/arcene/arcene_valid.data"),
        dtype=np.float32)
    y_eval = np.loadtxt(data_path + "/arcene/arcene_valid.labels")

    start = time()
    clf.fit(X_fit, y_fit)
    train_time = time() - start

    start = time()
    error_rate = 1.0 - clf.score(X_eval, y_eval)
    test_time = time() - start

    return error_rate, train_time, test_time
@repeat()
def bench_landsat(clf, random_state=None):
    """Benchmark ``clf`` on the landsat dataset.

    The classifier is fitted and scored on the same samples, so the
    reported error is a training error. ``random_state`` is accepted
    because the ``repeat`` decorator supplies it, but it is not used in
    this function. A single run yields
    ``(error_rate, train_time, test_time)``; ``repeat`` aggregates several
    runs into ``(mean, std)``.
    """
    # NOTE(review): ``load_landsat`` may not exist in released
    # scikit-learn versions -- confirm which build this script targets.
    bunch = sk_datasets.load_landsat()
    features = np.asarray(bunch.data, order='f', dtype=np.float32)
    target = bunch.target

    start = time()
    clf.fit(features, target)
    train_time = time() - start

    start = time()
    error_rate = 1.0 - clf.score(features, target)
    test_time = time() - start

    return error_rate, train_time, test_time
@repeat(3)
def bench_mnist(clf, random_state=None):
    """Benchmark ``clf`` on the 'MNIST original' dataset.

    The sample indices are shuffled with ``random_state``; the first 90%
    of the shuffled indices form the training set and the rest the test
    set. A single run yields ``(error_rate, train_time, test_time)``,
    where ``error_rate`` is ``1 - clf.score(...)`` on the test set;
    ``repeat`` aggregates three runs into ``(mean, std)``.
    """
    rng = check_random_state(random_state)
    mnist = sk_datasets.fetch_mldata('MNIST original')

    order = np.arange(len(mnist.data))
    rng.shuffle(order)
    n_train = int(0.9 * len(order))
    train_idx, test_idx = order[:n_train], order[n_train:]

    # Training features become Fortran-ordered float32; targets are
    # converted to double.
    X_fit = np.asfortranarray(mnist.data[train_idx], dtype=np.float32)
    y_fit = mnist.target[train_idx].astype(np.double)
    X_eval = mnist.data[test_idx].astype(np.float32)
    y_eval = mnist.target[test_idx].astype(np.double)

    start = time()
    clf.fit(X_fit, y_fit)
    train_time = time() - start

    start = time()
    error_rate = 1.0 - clf.score(X_eval, y_eval)
    test_time = time() - start

    return error_rate, train_time, test_time
def _run_benchmarks(clfs, datasets, res):
    """Run every benchmark in ``datasets`` with every classifier in ``clfs``.

    Each benchmark returns ``(mean, std)``; the pair is stored in
    ``res[dataset_name][impl_name]``.
    """
    # ``dict.items()`` works on both Python 2 and Python 3, whereas
    # ``iteritems()`` only exists on Python 2.
    for impl, clf in clfs.items():
        for dataset, ds_bench in datasets.items():
            mean, std = ds_bench(clf)
            res[dataset][impl] = (mean, std)


if __name__ == '__main__':
    res = defaultdict(dict)

    # First pass: 100 trees on all datasets except MNIST.
    clfs = {'r': RRandomForestClassifier(ntree=100, mtry=3, nodesize=1),
            'py': RandomForestClassifier(n_estimators=100, max_features=3,
                                         min_samples_leaf=1,
                                         n_jobs=1)}
    datasets = {'random_gaussian': bench_random_gaussian,
                'spam': bench_spam,
                'madelon': bench_madelon,
                'arcene': bench_arcene,
                'landsat': bench_landsat,
                'hastie_10_2': bench_hastie_10_2}
    _run_benchmarks(clfs, datasets, res)

    # Second pass: MNIST is run with only 10 trees.
    clfs = {'r': RRandomForestClassifier(ntree=10, mtry=3, nodesize=1),
            'py': RandomForestClassifier(n_estimators=10, max_features=3,
                                         min_samples_leaf=1,
                                         n_jobs=1)}
    datasets = {'mnist': bench_mnist}
    _run_benchmarks(clfs, datasets, res)

    # Report mean (std) per dataset, R next to Python. The row labelled
    # 'score' holds the first value returned by the benchmarks, which is
    # the error rate.
    for ds in res:
        print('_' * 80)
        print(ds)
        print("%s\t%s\t%s" % (' '*4, 'r'.center(13), 'py'.center(13)))
        for i, metric in enumerate(['score', 'train', 'test']):
            print("%s\t%.4f (%.2f)\t%.4f (%.2f)" %
                  (metric, res[ds]['r'][0][i], res[ds]['r'][1][i],
                   res[ds]['py'][0][i], res[ds]['py'][1][i]))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
________________________________________________________________________________ | |
arcene | |
r py | |
score 0.2633 (0.01) 0.2900 (0.00) | |
train 4.0266 (0.10) 7.8897 (0.02) | |
test 0.2117 (0.00) 0.0089 (0.00) | |
________________________________________________________________________________ | |
landsat | |
r py | |
score 0.0243 (0.00) 0.0548 (0.00) | |
train 2.3900 (0.04) 5.9089 (0.05) | |
test 0.1120 (0.00) 0.1043 (0.00) | |
________________________________________________________________________________ | |
spam | |
r py | |
score 0.0558 (0.00) 0.0664 (0.01) | |
train 1.6730 (0.02) 2.7901 (0.02) | |
test 0.0367 (0.00) 0.0235 (0.00) | |
________________________________________________________________________________ | |
random_gaussian | |
r py | |
score 0.1446 (0.00) 0.1486 (0.01) | |
train 0.3401 (0.01) 1.2787 (0.01) | |
test 0.1513 (0.00) 0.1279 (0.00) | |
________________________________________________________________________________ | |
madelon | |
r py | |
score 0.4300 (0.01) 0.3928 (0.00) | |
train 10.3250 (0.03) 19.6157 (0.25) | |
test 0.0789 (0.01) 0.0170 (0.00) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
________________________________________________________________________________ | |
arcene | |
r py | |
score 0.2300 (0.03) 0.2800 (0.01) | |
train 4.4058 (0.13) 3.7385 (0.11) | |
test 0.2299 (0.00) 0.2674 (0.05) | |
________________________________________________________________________________ | |
landsat | |
r py | |
score 0.0243 (0.00) 0.0540 (0.00) | |
train 2.4721 (0.04) 3.1411 (0.09) | |
test 0.1239 (0.01) 0.3311 (0.00) | |
________________________________________________________________________________ | |
spam | |
r py | |
score 0.0558 (0.00) 0.0649 (0.00) | |
train 1.7126 (0.01) 1.7856 (0.05) | |
test 0.0405 (0.00) 0.2319 (0.00) | |
________________________________________________________________________________ | |
random_gaussian | |
r py | |
score 0.1444 (0.00) 0.1514 (0.01) | |
train 0.3567 (0.01) 1.1473 (0.10) | |
test 0.1646 (0.01) 0.2301 (0.00) | |
________________________________________________________________________________ | |
madelon | |
r py | |
score 0.4211 (0.02) 0.3839 (0.02) | |
train 10.3918 (0.16) 8.3439 (0.09) | |
test 0.0871 (0.01) 0.2390 (0.00) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment