Last active
November 14, 2017 16:10
-
-
Save mdbecker/5159dba66cc5690d81a3b1ac2ff2d796 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn import metrics | |
def binary_cv_metrics(y, preds, m):
    """Fill ``m`` with binary-classification metrics for ``preds`` against ``y``.

    All rates are read off ``metrics.confusion_matrix(y, preds)``, which
    scikit-learn lays out with rows = actual class and columns = predicted
    class. Index 0 is treated as the negative class and index 1 as the
    positive class.

    Args:
        y: actual binary labels.
        preds: predicted binary labels, aligned with ``y``.
        m: dict that is updated in place with the keys
            'confusion_matrix', 'Accuracy', 'F1 score', 'FPR', 'FNR',
            'Specificity (TNR)', 'Sensitivity (TPR, Recall)',
            'PPV (Precision)' and 'NPV'.

    Returns:
        The same ``m`` object (also mutated in place, so existing callers
        that ignore the return value keep working).

    Note:
        The indexing below needs a 2x2 confusion matrix, i.e. both classes
        must occur somewhere in ``y`` or ``preds``.
    """
    cm = metrics.confusion_matrix(y, preds)

    def _rate(count, total):
        # float() forces true division on Python 2 as well. A zero total
        # (e.g. no predicted positives at a very high threshold) yields an
        # explicit NaN instead of a NumPy divide warning.
        total = float(total)
        return count / total if total else float('nan')

    actual_neg = cm[0, :].sum()     # TN + FP
    actual_pos = cm[1, :].sum()     # FN + TP
    predicted_neg = cm[:, 0].sum()  # TN + FN
    predicted_pos = cm[:, 1].sum()  # FP + TP

    m['confusion_matrix'] = cm
    m['Accuracy'] = metrics.accuracy_score(y, preds)
    m['F1 score'] = metrics.f1_score(y, preds)
    m['FPR'] = _rate(cm[0, 1], actual_neg)
    m['FNR'] = _rate(cm[1, 0], actual_pos)
    m['Specificity (TNR)'] = _rate(cm[0, 0], actual_neg)
    m['Sensitivity (TPR, Recall)'] = _rate(cm[1, 1], actual_pos)
    m['PPV (Precision)'] = _rate(cm[1, 1], predicted_pos)
    m['NPV'] = _rate(cm[0, 0], predicted_neg)
    return m
def plt_auc(pred, actual, ax):
    """Draw the ROC curve for ``pred`` on ``ax`` and annotate it with the AUC.

    Args:
        pred: predicted scores. Either a 2-D array whose column 1 holds the
            positive-class score (the original contract), or a 1-D
            array/Series of positive-class scores.
        actual: true binary labels, aligned with ``pred``.
        ax: matplotlib-style axes object to draw on.

    Returns:
        The area under the ROC curve as computed by ``metrics.auc``.
    """
    # Accept plain 1-D scores too, so callers no longer need to duplicate
    # them into a fake two-column array just to satisfy pred[:, 1].
    if getattr(pred, 'ndim', 1) > 1:
        scores = pred[:, 1]
    else:
        scores = pred

    fpr, tpr, _ = metrics.roc_curve(actual, scores)
    auc = metrics.auc(fpr, tpr)

    ax.plot(fpr, tpr)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], '--')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.text(0.7, 0.2, 'AUC = %0.2f' % (auc))
    return auc
# Score the held-out rows with the fitted model.
test_norm['Pred'] = result.predict(test_norm[in_vars])
pred_scores = test_norm['Pred']
true_labels = test_norm[label_name]

# The ROC helper is handed a two-column array, so the same scores are
# repeated in both columns.
preds = pred_scores.values
preds = np.array([preds, preds]).T

# Sweep 100 evenly spaced cut-offs from 0 up to the largest predicted score
# and collect one dict of metrics per cut-off.
threshes = np.linspace(0, pred_scores.max(), 100)
ms = []
for cutoff in threshes:
    row = {}
    is_positive = pred_scores > cutoff
    binary_cv_metrics(true_labels, is_positive, row)
    row[dx_class] = n
    ms.append(row)
ms_df = pd.DataFrame(ms)

# Top panel: selected metrics as a function of the cut-off.
# Bottom panel: ROC curve.
fig, ax = plt.subplots(2, 1, figsize=(8, 8))
metrics_ax, roc_ax = ax
curve_names = (
    'F1 score',
    'NPV',
    'PPV (Precision)',
    'Sensitivity (TPR, Recall)',
    'Specificity (TNR)',
)
for curve_name in curve_names:
    metrics_ax.plot(threshes, ms_df[curve_name], '-', label=curve_name)
metrics_ax.legend(loc=0)
ps_analysis_utils.plt_auc(preds, true_labels, ax=roc_ax)
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import statsmodels.api as sm | |
from sklearn import preprocessing | |
# You can use sklearn's train_test_split to create your train and test sets:
# http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
# NOTE(review): the column-name indexing below assumes X is a pandas
# DataFrame and train_idx / test_idx select rows from it -- confirm.
train = X[train_idx]
test = X[test_idx]

# Scale your features so that the coefficients will be easily comparable:
# http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-normalization
# Only the predictor columns are scaled (fitted on the training rows), so the
# binary label column keeps its original values for Logit and for the
# metrics computed later.
scaler = preprocessing.StandardScaler().fit(train[in_vars_no_intercept])

# scaler.transform() returns a plain NumPy array, which can neither take the
# 'intercept' column assignment nor be indexed by column name. Write the
# scaled values back into DataFrame copies instead.
train_norm = train.copy()
train_norm[in_vars_no_intercept] = scaler.transform(train[in_vars_no_intercept])
test_norm = test.copy()
test_norm[in_vars_no_intercept] = scaler.transform(test[in_vars_no_intercept])
train_norm['intercept'] = 1
test_norm['intercept'] = 1

# list of columns in your DataFrame to use in training
in_vars = ['intercept'] + in_vars_no_intercept
logit = sm.Logit(train_norm[label_name], train_norm[in_vars])
result = logit.fit()
print(result.summary())

# Drop variables with high p-values and refit, first keeping p < 0.1 and then
# p < 0.05 (this is called a parsimonious model
# https://stats.stackexchange.com/a/17570)
for alpha in (0.1, 0.05):
    in_vars = result.pvalues.index[result.pvalues < alpha]
    logit = sm.Logit(train_norm[label_name], train_norm[in_vars])
    result = logit.fit()
    print(result.summary())

# Predicted probabilities for the held-out rows from the final model.
test_norm['Pred'] = result.predict(test_norm[in_vars])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment