Created
February 19, 2015 12:46
-
-
Save betatim/a31777c36e3b4b6f21bb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
import pandas as pd | |
import numpy as np | |
import pandas.core.common as com | |
from pandas.core.index import Index | |
from pandas.tools import plotting | |
from pandas.tools.plotting import scatter_matrix | |
from sklearn import cross_validation | |
from sklearn.grid_search import GridSearchCV | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import AdaBoostClassifier | |
from sklearn.ensemble import GradientBoostingClassifier | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
from sklearn.metrics import classification_report, roc_auc_score | |
from sklearn.base import TransformerMixin, BaseEstimator, ClassifierMixin | |
# Names of the 28 feature columns, in the order they appear in the CSV after
# the label column. The literal is wrapped for readability, and a wrap can fall
# in the middle of a name ("jet 2" / "pt"), so every run of whitespace
# (including newlines) is collapsed to one space rather than only stripping
# the ends; otherwise a name such as "jet 2\npt" would keep its line break.
var_names = [
    " ".join(name.split())
    for name in """lepton pT, lepton eta, lepton phi, missing energy magnitude,
missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, jet 2
pt, jet 2 eta, jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, jet 3 phi,
jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, jet 4 b-tag, m_jj, m_jjj,
m_lv, m_jlv, m_bb, m_wbb, m_wwbb""".split(",")
]
# Load the sample. Column names are supplied explicitly through `names`:
# the first column is the label 'y', followed by the feature columns.
df = pd.read_csv("/Users/thead/Downloads/HIGGS-small.csv", engine='c',
                 names=['y'] + var_names)

# Set aside an evaluation sample (33% of the rows) that is not used for any
# of the model development below. `random_state` is fixed so the hold-out is
# the same on every run.
# `.values` returns a plain numpy array of the feature columns; it replaces
# the deprecated `DataFrame.as_matrix`.
X_dev, X_eval, y_dev, y_eval = cross_validation.train_test_split(
    df[var_names].values,
    df.y,
    test_size=0.33,
    random_state=78534)
# Gradient boosting classifier shared by every experiment below.
# max_depth=1 restricts each base learner to a single split; the 0.5 values
# for `subsample` and `max_features` are fractions (of the training rows and
# of the features, respectively, per the scikit-learn parameter docs).
# No random_state is given, so repeated fits are not guaranteed identical.
clf = GradientBoostingClassifier(
    learning_rate=0.4,
    n_estimators=3000,
    max_depth=1,
    max_features=0.5,
    subsample=0.5,
)
# Three alternative cross-validation iterators over the development sample,
# each producing three train/test partitions. None of them is given a
# random_state, so the shuffled variants differ from run to run.

# Three independent random splits, each holding out 33% of the rows.
shuffle_split = cross_validation.ShuffleSplit(
    len(X_dev), n_iter=3, test_size=0.33)

# 3-fold split with shuffling enabled.
kfold_shuffle = cross_validation.KFold(
    len(X_dev), n_folds=3, shuffle=True)

# 3-fold split with shuffling disabled.
kfold = cross_validation.KFold(
    len(X_dev), n_folds=3, shuffle=False)
# Score the same classifier under four cross-validation strategies (the plain
# integer 3 lets scikit-learn pick its default 3-fold scheme) and print the
# per-fold ROC AUC for each so the strategies can be compared side by side.
for cv in (3, shuffle_split, kfold, kfold_shuffle):
    scores = cross_validation.cross_val_score(clf, X_dev, y_dev,
                                              scoring="roc_auc",
                                              n_jobs=6,
                                              cv=cv)
    # Single-argument print(...) calls produce the same output under both
    # Python 2 and Python 3, unlike the print statement.
    print(cv)
    print(scores)
    print("-" * 80)
# split our development sample into a test and train set
XXyy = cross_validation.train_test_split(X_dev,
                                         y_dev,
                                         test_size=0.5,
                                         random_state=7853354)
X_train, X_test, y_train, y_test = XXyy

clf.fit(X_train, y_train)

# ROC AUC is a ranking metric and needs continuous scores: feeding it the hard
# 0/1 labels from predict() collapses the curve to a single operating point
# and is not comparable with the scoring="roc_auc" cross-validation results.
# np.ravel guards against scikit-learn versions that return the binary
# decision function as a column vector instead of a 1-D array.
train_scores = np.ravel(clf.decision_function(X_train))
test_scores = np.ravel(clf.decision_function(X_test))

# %-formatting keeps the printed text identical under Python 2 and Python 3.
print("train: %s" % roc_auc_score(y_train, train_scores))
print("test: %s" % roc_auc_score(y_test, test_scores))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You shouldn't use `.as_matrix`, which will create a numpy matrix, which is not an accepted datatype for scikit-learn (I guess we convert it to an array). You should use `np.asarray` or just `.values`.