Benchmark R's gbm module via rpy2
""" | |
Benchmark script to bench R's gbm package via rpy2. | |
NOTE:: | |
make sure you run | |
$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64/R/lib | |
""" | |
import numpy as np
import rpy2
from time import time
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
import pylab as pl

gbm = importr('gbm')
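# importr exposes the R package as a Python namespace; dots in R function names
# are translated to underscores, so R's gbm.fit and predict.gbm are available
# below as gbm.gbm_fit and gbm.predict_gbm.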


def repeat(f):
    def wrapper(*args, **kargs):
        scores = []
        for i in range(10):
            scores.append(f(*args, random_state=i, **kargs))
        scores = np.array(scores)
        return scores.mean(axis=0), scores.std(axis=0)
    return wrapper
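
# `repeat` runs the decorated benchmark 10 times with random_state = 0, ..., 9
# and returns the (mean, std) of the reported score, e.g. (illustration only):
#
#     @repeat
#     def bench_foo(random_state=None):
#         ...
#
#     bench_foo()  # -> (mean over 10 runs, std over 10 runs)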

# ignore overflows due to exp
#np.seterr(invalid='print', under='print', divide='print', over='ignore')

classification_params = {"distribution": "bernoulli", "shrinkage": 1.0,
                         "n.tree": 500, "bag.fraction": 0.5, "verbose": False,
                         "n.minobsinnode": 1, "interaction.depth": 1}


@repeat
def bench_random_gaussian(random_state=None):
    rs = check_random_state(random_state)
    shape = (12000, 10)
    X = rs.normal(size=shape).reshape(shape)
    y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64)
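    # 9.34 is roughly the median of a chi-squared distribution with 10 degrees
    # of freedom, so about half of the samples are labeled positive (the setup
    # of Example 10.2 in Hastie et al., "The Elements of Statistical Learning").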
    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
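

# Spambase (UCI): 4601 emails with 57 features; 1536 examples are held out for
# testing, the same test-set size as in "The Elements of Statistical Learning".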
@repeat
def bench_spam(random_state=None):
    X = np.loadtxt("/home/pprett/corpora/spam/spambase.data", delimiter=",")
    y = X[:, -1].ravel()
    X = X[:, :-1]
    f = open("/home/pprett/corpora/spam/spambase.names")
    feature_names = np.array([l.split(":")[0] for l in f])
    X, y = shuffle(X, y, random_state=random_state)
    X_test, y_test = X[:1536], y[:1536]
    X_train, y_train = X[1536:], y[1536:]
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    error_rate = np.mean(pred != y_test)
    return error_rate
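

# Madelon: synthetic binary classification dataset from the NIPS 2003 feature
# selection challenge (500 features); scored once on the official validation
# split, reporting accuracy.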
def bench_madelon():
    X_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.data")
    y_train = np.loadtxt("/home/pprett/corpora/madelon/madelon_train.labels")
    X_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.data")
    y_test = np.loadtxt("/home/pprett/corpora/madelon/madelon_valid.labels")
    y_train[y_train == -1] = 0
    y_test[y_test == -1] = 0
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    score = np.mean(pred == y_test)
    return score
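

# Arcene: mass-spectrometry dataset from the NIPS 2003 feature selection
# challenge (10000 features, 100 training and 100 validation samples).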
def bench_arcene():
    X_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.data")
    y_train = np.loadtxt("/home/pprett/corpora/arcene/arcene_train.labels")
    X_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.data")
    y_test = np.loadtxt("/home/pprett/corpora/arcene/arcene_valid.labels")
    y_train[y_train == -1.0] = 0
    y_test[y_test == -1.0] = 0
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **classification_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": classification_params["n.tree"]})
    pred = (np.array(pred) >= 0.0).astype(np.float64)
    score = np.mean(pred == y_test)
    return score


regression_params = {"distribution": "gaussian", "shrinkage": 0.1,
                     "n.tree": 100, "bag.fraction": 1.0, "verbose": False,
                     "n.minobsinnode": 1, "interaction.depth": 4}


@repeat
def bench_boston(random_state=None):
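    # Boston housing: 506 samples, 13 features; 90% train / 10% test split.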
    boston = datasets.load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=random_state)
    offset = int(X.shape[0] * 0.9)
    X_train = X[:offset]
    y_train = y[:offset]
    X_test = X[offset:]
    y_test = y[offset:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse
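

# Friedman #1-#3: synthetic regression benchmarks from Friedman (1991);
# 1200 samples are generated per run, 200 for training and 1000 for testing.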
@repeat
def bench_friedman1(random_state=None):
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse


@repeat
def bench_friedman2(random_state=None):
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse


@repeat
def bench_friedman3(random_state=None):
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    X_train = numpy2ri(X_train)
    X_test = numpy2ri(X_test)
    y_train = numpy2ri(y_train)
    model = gbm.gbm_fit(X_train, y_train, **regression_params)
    pred = gbm.predict_gbm(model, X_test,
                           **{"n.tree": regression_params["n.tree"]})
    pred = np.array(pred, dtype=np.float64)
    mse = np.mean((pred - y_test) ** 2.0)
    return mse
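

# The @repeat benchmarks print the (mean, std) of their error rate or MSE over
# 10 runs; Madelon and Arcene are run once and print a single accuracy.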
if __name__ == "__main__":
    print "Example 10.2", bench_random_gaussian()
    print "spam", bench_spam()
    print "Madelon", bench_madelon()
    print "Arcene", bench_arcene()
    print "Boston", bench_boston()
    print "Friedman#1", bench_friedman1()
    print "Friedman#2", bench_friedman2()
    print "Friedman#3", bench_friedman3()