Last active
February 8, 2018 20:28
-
-
Save erogol/7267900 to your computer and use it in GitHub Desktop.
Logistic regression ensemble with feature selection. Requires the scikit-learn (sklearn) Python library.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def linear_model_ensemble(X, y, X_test, fold_num, fold_num_sec, grid_search_range, oobe=True, x_val=True):
    '''
    Train an ensemble of logistic-regression classifiers with per-fold
    chi2 feature selection, optionally estimate the ensemble's
    generalization performance, then refit every classifier on the full
    training data and return averaged predictions for X_test.

    X - Train set
    y - Train set labels. Labels are 1 for pos instances and -1 for neg instances
    X_test - Test set predicted after the full-data refit
    fold_num - Fold count for the first-step X-validation that sets the
               hyper-params and feature selectors (one classifier per fold)
    fold_num_sec - Fold count for the second-step X-validation that tests
                   the generalization performance of the ensemble
    grid_search_range - List of values to be used as C candidates in grid search
    oobe - Weight each classifier's vote by its out-of-fold score
    x_val - Whether to run the second-step X-validation

    Returns (pred_all, clfs): the averaged ensemble prediction for X_test
    and the list of fitted classifiers.
    '''
    # Import necessary modules (sklearn.cross_validation was removed in
    # scikit-learn 0.20; model_selection is the current API).
    import numpy as np
    from sklearn.model_selection import KFold, StratifiedKFold
    from sklearn.metrics import roc_auc_score
    import sklearn.linear_model as lm
    from sklearn.feature_selection import SelectPercentile, chi2

    rd = lm.LogisticRegression(dual=True, tol=1e-5,
                               fit_intercept=True, intercept_scaling=1.0,
                               class_weight=None, random_state=None)

    scores = np.zeros((0,))   # out-of-fold score of each trained classifier
    clfs = []                 # one classifier per training fold
    feat_selects = []         # matching feature selector per fold

    # First X-val iteration. Each fold trains a different classifier whose
    # C is grid-searched and whose discriminative features are selected on
    # that fold's training partition.
    kf = KFold(n_splits=fold_num)
    for train, test in kf.split(X):
        # Set data folds
        train_fold, test_fold = X[train], X[test]
        train_y, test_y = y[train], y[test]
        # Feature selection: keep the top 16% of features by chi2 score
        feat_select = SelectPercentile(score_func=chi2, percentile=16).fit(
            train_fold, train_y.astype(float))
        feat_selects.append(feat_select)
        train_fold = feat_select.transform(train_fold)
        test_fold = feat_select.transform(test_fold)
        tuned_parameters = [{'C': grid_search_range}]
        # Hyper-parameter optimization (helper defined elsewhere in this file)
        rd_fitted, score = find_best_parameters(train_fold, train_y,
                                                test_fold, test_y,
                                                rd, tuned_parameters)
        clfs.append(rd_fitted)
        scores = np.append(scores, score)

    # Second-step X-val to see the generalization performance of the
    # (optionally score-weighted) averaged ensemble.
    if x_val:
        skf = StratifiedKFold(n_splits=fold_num_sec)
        clf_scores = np.array(())
        for train, test in skf.split(X, y):
            test_fold, test_y = X[test], y[test]
            pred_vals = None
            for counter, clf in enumerate(clfs):
                test_fold_transed = feat_selects[counter].transform(test_fold)
                pred_val = clf.predict(test_fold_transed)
                if oobe:
                    # Weight this classifier's vote by its out-of-fold score
                    pred_val = pred_val * scores[counter]
                pred_vals = pred_val if pred_vals is None else pred_vals + pred_val
            # Compute current fold's prediction score
            pred = pred_vals / len(clfs)
            clf_score = roc_auc_score(test_y.astype(float), pred)
            clf_scores = np.append(clf_scores, clf_score)
        # Validation result
        print("Final X-val result", clf_scores.mean())

    # Full training time: refit each classifier on all data (so it sees the
    # full, untransformed feature space) and average the votes for X_test.
    print("training on full data")
    pred_all = None
    for counter, clf in enumerate(clfs):
        clf.fit(X, y)
        pred = clf.predict(X_test)
        if oobe:
            pred = pred * scores[counter]
        pred_all = pred if pred_all is None else pred_all + pred
    pred_all = pred_all / len(clfs)
    return pred_all, clfs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment