Skip to content

Instantly share code, notes, and snippets.

# Frequency-encode the claim procedure/diagnosis codes: each code value is
# replaced by the number of times it occurs in the dataset (code rarity is a
# useful signal for fraud detection).
ClmProcedureCode_1_count = patient_data['ClmProcedureCode_1'].value_counts().to_dict()
patient_data['ClmProcedureCode_1_count'] = patient_data['ClmProcedureCode_1'].map(ClmProcedureCode_1_count)
ClmProcedureCode_2_count = patient_data['ClmProcedureCode_2'].value_counts().to_dict()
patient_data['ClmProcedureCode_2_count'] = patient_data['ClmProcedureCode_2'].map(ClmProcedureCode_2_count)
ClmProcedureCode_3_count = patient_data['ClmProcedureCode_3'].value_counts().to_dict()
patient_data['ClmProcedureCode_3_count'] = patient_data['ClmProcedureCode_3'].map(ClmProcedureCode_3_count)
ClmDiagnosisCode_1_count = patient_data['ClmDiagnosisCode_1'].value_counts().to_dict()
# FIX: the count dict above was built but never mapped into a column, unlike
# the three procedure codes. The model's feature-importance output below lists
# 'ClmDiagnosisCode_1_count' as a feature, so the mapping must be applied.
patient_data['ClmDiagnosisCode_1_count'] = patient_data['ClmDiagnosisCode_1'].map(ClmDiagnosisCode_1_count)
# Identifier, date, physician, and coverage columns that are dropped before
# modelling (they are either IDs or are replaced by engineered features).
col_to_remove = [
    'Provider', 'BeneID', 'ClaimID',
    'ClaimStartDt', 'ClaimEndDt',
    'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
    'ClmAdmitDiagnosisCode',
    'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
    'DiagnosisGroupCode',
    'AdmissionDt', 'DischargeDt',
]
# The ten claim-diagnosis-code columns, ClmDiagnosisCode_1 .. ClmDiagnosisCode_10.
diagnosis_codes = [f'ClmDiagnosisCode_{i}' for i in range(1, 11)]
# The six claim-procedure-code columns, ClmProcedureCode_1 .. ClmProcedureCode_6.
procedure_codes = [f'ClmProcedureCode_{i}' for i in range(1, 7)]
# Date-typed columns handled separately from the code/ID columns.
# FIX: the original list accidentally contained 'Claim_Start' and
# 'Admission_Date' twice each; duplicates removed (first-occurrence order kept).
oth_cols = ['DOB', 'DOD', 'Claim_Start', 'Admission_Date', 'Claim_End', 'Discharge_Date']
# Shared module-level min-max scaler used by scale_fun below.
# NOTE(review): its fit state is overwritten on every scale_fun call, so any
# later use of this object relies on whichever column was scaled last.
min_max_scaler = preprocessing.MinMaxScaler()
def scale_fun(X_train, X_test, col):
    """Min-max scale a single column of the train/test frames.

    The shared module-level ``min_max_scaler`` is fit on the training data
    only (so no information leaks from the test set) and then applied to both
    splits. Returns the scaled train and test arrays, each of shape (n, 1).
    """
    train_vals = X_train[col].values.reshape(-1, 1)
    test_vals = X_test[col].values.reshape(-1, 1)
    min_max_scaler.fit(train_vals)
    return min_max_scaler.transform(train_vals), min_max_scaler.transform(test_vals)
# NOTE(review): loop header whose body is missing from this excerpt (the
# following line starts an unrelated statement) — presumably it applied
# per-column scaling; confirm against the full source.
for col in X_train.columns:
# Hyperparameter grid for the decision tree: maximum depth and the minimum
# number of samples required to split an internal node.
parameters = {'max_depth':[1, 5, 10, 20, 50],
'min_samples_split':[5, 10, 100, 500]}
dt = DecisionTreeClassifier()
# 3-fold grid search optimising macro-averaged F1; n_jobs=-1 uses all cores,
# return_train_score=True keeps train scores for over/under-fitting checks.
dt_grid = GridSearchCV(dt, param_grid=parameters, n_jobs=-1, verbose=1,scoring='f1_macro',cv=3,return_train_score=True)
dt_grid.fit(X_train,y_train)
# Best hyperparameters found by the search (reused below to refit a final tree).
best_depth=dt_grid.best_params_['max_depth']
best_samples=dt_grid.best_params_['min_samples_split']
from sklearn.metrics import roc_curve, auc
# NOTE(review): these two lines repeat the best_params_ extraction done right
# after the grid search above — harmless but redundant.
best_depth=dt_grid.best_params_['max_depth']
best_samples=dt_grid.best_params_['min_samples_split']
# Refit a fresh decision tree on the full training set using the tuned
# hyperparameters from the grid search.
dt_1 = DecisionTreeClassifier(max_depth=best_depth,min_samples_split=best_samples)
dt_1.fit(X_train, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
+---------------+-----------------+----------------+----------------+---------------+
| Model | Train AUC Score | Test AUC Score | Train F1 Score | Test F1 Score |
+---------------+-----------------+----------------+----------------+---------------+
| Decision_Tree | 0.9967 | 0.9909 | 0.99314 | 0.9771 |
+---------------+-----------------+----------------+----------------+---------------+
# A parameter grid for XGBoost: number of boosting rounds and learning rate.
parameters = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.1, 0.01, 0.05],
}
# FIX: `silent` and `nthread` were deprecated and later removed from
# XGBClassifier; `verbosity=0` and `n_jobs` are the supported equivalents
# with the same effect (quiet training, 4 threads).
xgb = XGBClassifier(objective='binary:logistic',
                    verbosity=0, n_jobs=4)
# Same cross-validated search setup as the decision tree: 3-fold CV scored
# with macro-averaged F1, all cores, train scores retained.
xg_grid = GridSearchCV(xgb, param_grid=parameters, n_jobs=-1, verbose=1,
                       scoring='f1_macro', cv=3, return_train_score=True)
training score: 0.9998006310038143
testing score: 0.990791808142236
# Pull the booster's "weight" importance (number of times each feature is used
# in a split) and rank the features from most to least important.
feature_important = xg1.get_booster().get_score(importance_type='weight')
keys = list(feature_important.keys())
values = list(feature_important.values())
frame = pd.DataFrame(data=values, index=keys, columns=["score"])
data = frame.sort_values(by="score", ascending=False)
provider_count 14960
State 5550
attend_physician_count 4258
County 3218
OPAnnualReimbursementAmt 816
total_diff_amount 800
ClmDiagnosisCode_1_count 713
OPAnnualDeductibleAmt 668
InscClaimAmtReimbursed 634