Skip to content

Instantly share code, notes, and snippets.

from sklearn.metrics import roc_curve, auc
# Refit a decision tree on the training data using the hyper-parameters that
# the grid search (dt_grid) reported as best.
best_depth, best_samples = (
    dt_grid.best_params_[name] for name in ('max_depth', 'min_samples_split')
)
dt_1 = DecisionTreeClassifier(
    min_samples_split=best_samples,
    max_depth=best_depth,
)
dt_1.fit(X_train, y_train)
# roc_auc_score(y_true, y_score) needs probability estimates of the positive
# class as its second argument, not hard class predictions.
# NOTE(review): pred_func is defined elsewhere; presumably it returns those
# probabilities for the fitted model -- confirm.
y_train_pred = pred_func(dt_1, X_train)
+---------------+-----------------+----------------+----------------+---------------+
| Model | Train AUC Score | Test AUC Score | Train F1 Score | Test F1 Score |
+---------------+-----------------+----------------+----------------+---------------+
| Decision_Tree | 0.9967 | 0.9909 | 0.99314 | 0.9771 |
+---------------+-----------------+----------------+----------------+---------------+
# Hyper-parameter grid for XGBoost: 3 values of n_estimators x 3 learning
# rates = 9 candidate combinations.
parameters = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.1, 0.01, 0.05],
}
# Base estimator for the search. `verbosity=0` and `n_jobs=4` are the current
# spellings of the deprecated `silent=True` and `nthread=4` keyword arguments.
xgb = XGBClassifier(objective='binary:logistic', verbosity=0, n_jobs=4)
# 3-fold cross-validated grid search, scored on macro-averaged F1, which also
# records the training-fold scores.
# NOTE(review): n_jobs=-1 here combined with 4 threads per model may
# oversubscribe the CPU -- confirm this is intended.
xg_grid = GridSearchCV(
    xgb,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
    scoring='f1_macro',
    cv=3,
    return_train_score=True,
)
training score: 0.9998006310038143
testing score: 0.990791808142236
# Pull the per-feature 'weight' importance scores out of the fitted booster
# and rank them from most to least important.
feature_important = xg1.get_booster().get_score(importance_type='weight')
keys = list(feature_important)
values = [feature_important[name] for name in keys]
# One row per feature, indexed by feature name, with a single "score" column.
data = pd.DataFrame(data=values, index=keys, columns=["score"])
data = data.sort_values(by="score", ascending=False)
# Show the 20 highest-scoring features.
data.head(20)
provider_count 14960
State 5550
attend_physician_count 4258
County 3218
OPAnnualReimbursementAmt 816
total_diff_amount 800
ClmDiagnosisCode_1_count 713
OPAnnualDeductibleAmt 668
InscClaimAmtReimbursed 634
# Grand total of the importance scores over every feature (not only the 20
# that get plotted); used below as the denominator of the percentage labels.
s1 = data['score']
s_s1 = sum(s1.tolist())
s_s1
plt.style.use('fivethirtyeight')
# Horizontal bar chart of the 20 highest-scoring features.
ax = data.head(20).plot(kind='barh', color='red')
for p in ax.patches:
    # Share of the total importance represented by this bar (bar width is the
    # feature's score on a horizontal bar chart).
    percentage = '{:.1f}%'.format(100 * p.get_width() / s_s1)
    # Anchor point derived from the bar's right edge and top edge.
    x = p.get_x() + p.get_width() - 0.5
    y = p.get_y() + p.get_height()
    # NOTE(review): percentage, x and y are not used in the visible code;
    # presumably a call such as ax.annotate(percentage, (x, y)) follows --
    # confirm against the full notebook.
+---------+-----------------+----------------+----------------+---------------+
| Model | Train AUC Score | Test AUC Score | Train F1 Score | Test F1 Score |
+---------+-----------------+----------------+----------------+---------------+
| XgBoost | 0.99938 | 0.99855 | 0.9998 | 0.990791 |
+---------+-----------------+----------------+----------------+---------------+
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Meta-learner that combines the outputs of the base classifiers.
# `n_jobs` replaces the deprecated `nthread` keyword; -1 keeps the original
# "use all available threads" setting.
x_cfl = XGBClassifier(n_estimators=1000, n_jobs=-1)
# Base learners: two XGBoost models, a class-balanced decision tree that
# reuses the grid-searched depth/split settings, and a class-balanced
# logistic regression.
# NOTE(review): x_1 and x_2 are configured identically -- confirm the
# duplication is intentional.
x_1 = XGBClassifier(n_estimators=500, n_jobs=-1)
x_2 = XGBClassifier(n_estimators=500, n_jobs=-1)
x_3 = DecisionTreeClassifier(
    max_depth=best_depth,
    min_samples_split=best_samples,
    class_weight='balanced',
)
x_4 = LogisticRegression(class_weight='balanced')
# NOTE(review): the classifiers=/meta_classifier= keywords suggest this is
# mlxtend's StackingClassifier -- confirm against the file's imports.
s_clf = StackingClassifier(
    classifiers=[x_1, x_2, x_3, x_4],
    meta_classifier=x_cfl,
)
s_clf.fit(X_train, y_train)
+---------------------+-----------------+----------------+----------------+---------------+
| Model | Train AUC Score | Test AUC Score | Train F1 Score | Test F1 Score |
+---------------------+-----------------+----------------+----------------+---------------+
| Stacking_Classifier | 0.99537 | 0.9902 | 0.99429 | 0.98759 |
+---------------------+-----------------+----------------+----------------+---------------+