Last active
December 25, 2015 17:09
-
-
Save dylanjf/7011219 to your computer and use it in GitHub Desktop.
3 rep 10 fold CV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######## 3 rep 10 fold CV to determine feature sparsity percentage via RFE ########
# X = concatenated text features for training set (title, body, url),
#     transformed via TfidfVectorizer
# y = training set classification (0, 1)
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.cross_validation import KFold
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

#...code here to gen X and y

# NOTE(review): dual=True is only valid for the liblinear L2 formulation; on
# scikit-learn releases that expose a `solver` argument this presumably needs
# solver='liblinear' -- confirm against the installed version.
model = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001,
                              C=1, fit_intercept=True, intercept_scaling=1.0,
                              class_weight=None, random_state=None)

# Experiment layout: n_reps repetitions of n_folds-fold CV; within each fold
# the model is re-fit with j% of the weakest features removed, j = 0..n_steps-1.
seed = 42
n_reps = 3
n_folds = 10
n_steps = 100

# mean_auc[j] accumulates the test AUC obtained after dropping j% of features;
# it is turned into a mean over all (repetition, fold) pairs after the loops.
mean_auc = [0.0] * n_steps

for i in range(1, n_reps + 1):
    # shuffle=True is required for random_state to have any effect; without it
    # every repetition would reuse exactly the same contiguous folds.
    # (`indices=True` was dropped: index arrays are already the default.)
    cv_split = KFold(len(y), n_folds=n_folds, shuffle=True,
                     random_state=seed * i)

    # Within each fold:
    # 1) Obtain train / test splits for the fold
    # 2) Train LogReg model on the entire train split
    # 3) Rank features by abs() of the LogReg coefficients (ascending)
    # 4) For 0-99% removed features, re-train and record AUC on the test split
    for train_index, test_index in cv_split:
        X_cv_train, X_cv_test = X[train_index], X[test_index]
        Y_cv_train, Y_cv_test = y[train_index], y[test_index]

        model.fit(X_cv_train, Y_cv_train)
        # coef_ is 2-D for a binary problem; flatten to one weight per feature.
        coef = model.coef_.ravel()
        # Ascending by magnitude: the most important features sit at the end.
        important_coef = np.argsort(np.abs(coef))
        n_features = len(important_coef)

        for j in range(n_steps):
            # Integer arithmetic avoids float truncation surprises, and the
            # floor of 1 prevents a zero-length request, which as a negative
            # slice (`[-0:]`) would silently select *all* features.
            n_keep = max(1, (n_features * (n_steps - j)) // n_steps)
            important_coef_subset = important_coef[n_features - n_keep:]

            X_cv_train_subset = X_cv_train[:, important_coef_subset]
            X_cv_test_subset = X_cv_test[:, important_coef_subset]

            model.fit(X_cv_train_subset, Y_cv_train)
            pred = model.predict_proba(X_cv_test_subset)[:, 1]
            mean_auc[j] += metrics.roc_auc_score(Y_cv_test, pred)

    print("Fold set %d complete." % i)

# Average the accumulated AUCs over every (repetition, fold) pair.
mean_auc = [total / float(n_reps * n_folds) for total in mean_auc]

# Index of the best mean AUC == percentage of features to remove.
best_pct = int(np.argmax(mean_auc))

# NOTE(review): `plot` is not defined in the visible code; presumably this ran
# in a pylab-enabled interactive session -- confirm before running as a script.
plot(mean_auc)
best_pct
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment