Skip to content

Instantly share code, notes, and snippets.

@betatim
Created February 19, 2015 12:46
Show Gist options
  • Save betatim/a31777c36e3b4b6f21bb to your computer and use it in GitHub Desktop.
Save betatim/a31777c36e3b4b6f21bb to your computer and use it in GitHub Desktop.
import random
import pandas as pd
import numpy as np
import pandas.core.common as com
from pandas.core.index import Index
from pandas.tools import plotting
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin
# Feature column names, used (after the leading 'y' label column) as the
# column names of the CSV that is loaded below.
var_names = """lepton pT, lepton eta, lepton phi, missing energy magnitude,
missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, jet 2
pt, jet 2 eta, jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, jet 3 phi,
jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, jet 4 b-tag, m_jj, m_jjj,
m_lv, m_jlv, m_bb, m_wbb, m_wwbb""".split(",")
# Collapse *internal* whitespace too, not just the ends: the literal above
# wraps in the middle of "jet 2 / pt", so a plain strip() would leave a
# newline embedded in that column name ("jet 2\npt").
var_names = [" ".join(c.split()) for c in var_names]
# Load the sample into a DataFrame. Column names are supplied explicitly:
# the label column 'y' first, followed by the feature names. Because `names`
# is passed, pandas reads the first row of the file as data, not as a header.
df = pd.read_csv(
    "/Users/thead/Downloads/HIGGS-small.csv",
    names=['y'] + var_names,
    engine='c',
)
# Set aside an evaluation sample (33% of the rows) that is not touched by
# the cross-validation or training below; the remaining rows form the
# development sample. The fixed random_state makes the split reproducible.
#
# The feature matrix is taken with `.values` rather than the deprecated
# `DataFrame.as_matrix`; both yield a plain numpy array of the selected
# columns.
X_dev, X_eval, y_dev, y_eval = cross_validation.train_test_split(
    df[var_names].values,
    df.y,
    test_size=0.33,
    random_state=78534,
)
# Gradient boosted decision stumps (max_depth=1). Each boosting stage is
# fitted on a random half of the samples (subsample=0.5) and considers half
# of the features when searching for a split (max_features=0.5).
clf = GradientBoostingClassifier(
    learning_rate=0.4,
    n_estimators=3000,
    max_depth=1,
    max_features=0.5,
    subsample=0.5,
)
# Three explicit cross-validation schemes over the development sample, to be
# compared against each other (and against a plain integer `cv`) below.

# Three independent random train/test partitions, 33% held out each time.
shuffle_split = cross_validation.ShuffleSplit(
    len(X_dev), n_iter=3, test_size=0.33
)

# Three folds, with the sample indices shuffled before they are split.
kfold_shuffle = cross_validation.KFold(len(X_dev), n_folds=3, shuffle=True)

# Three folds taken in the order the rows are stored (no shuffling).
kfold = cross_validation.KFold(len(X_dev), n_folds=3, shuffle=False)
# Score the same classifier under each cross-validation scheme and print the
# per-fold ROC AUC values so the schemes can be compared side by side.
# `cv=3` lets scikit-learn pick its default 3-fold strategy; the other
# entries are the explicit splitter objects built above.
for cv in (3, shuffle_split, kfold, kfold_shuffle):
    scores = cross_validation.cross_val_score(
        clf,
        X_dev,
        y_dev,
        scoring="roc_auc",
        n_jobs=6,
        cv=cv,
    )
    # Single-argument print() calls produce the same output on Python 2
    # (where the file was written) and on Python 3.
    print(cv)
    print(scores)
    print("-" * 80)
# Split the development sample 50/50 into a training and a testing set
# (fixed random_state for a reproducible split) and fit the classifier on
# the training half.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X_dev,
    y_dev,
    test_size=0.5,
    random_state=7853354,
)
clf.fit(X_train, y_train)

# ROC AUC has to be computed from a continuous score, not from the hard
# class labels returned by predict(): with labels the ROC curve collapses to
# a single operating point and the result is not comparable with the
# scoring="roc_auc" values from cross_val_score. Column 1 of predict_proba
# is the probability of the second entry of clf.classes_ (the larger label).
train_scores = clf.predict_proba(X_train)[:, 1]
test_scores = clf.predict_proba(X_test)[:, 1]
print("train: %s" % roc_auc_score(y_train, train_scores))
print("test: %s" % roc_auc_score(y_test, test_scores))
@amueller
Copy link

You shouldn't use `.as_matrix`, which will create a numpy matrix; that is not an accepted datatype for scikit-learn (I guess we convert it to an array). You should use `np.asarray` or just `.values`.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment