Created
March 15, 2021 10:13
-
-
Save PranjalDureja0002/57ea91d224656b3a3710465709757ffd to your computer and use it in GitHub Desktop.
final
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _value_frequency(frame, column):
    """Return, for each row, how many rows of ``frame`` share its ``column`` value.

    Rows whose value is NaN get NaN, because ``value_counts`` skips NaN.
    """
    return frame[column].map(frame[column].value_counts())


def _any_column_equals(frame, columns, code):
    """Return a 0/1 integer Series: 1 where any of ``columns`` equals ``code``."""
    return frame[columns].eq(code).any(axis=1).astype(int)


def final_fun_1(X, data_dir='health_cs_data/', model_path='best_model.pkl',
                threshold=0.331):
    """Preprocess raw claim data, engineer features and predict with the saved model.

    Args:
        X: sequence of four CSV file names located in ``data_dir``, in this
            order: beneficiary, inpatient, outpatient, labels (provider list).
        data_dir: directory that holds the four CSV files.
        model_path: path of the joblib-serialised model to load.
        threshold: probability cut-off; rows whose positive-class probability
            is greater than or equal to it are predicted as 1.

    Returns:
        A DataFrame with one row per merged claim row and two columns:
        ``providerID`` and ``y_predicted`` (0 or 1).

    Side effects:
        Writes the engineered (unscaled) features to ``patient_data_final.csv``
        in the current working directory.
    """
    import os

    d_beneficiary = pd.read_csv(os.path.join(data_dir, X[0]))
    d_inpatient = pd.read_csv(os.path.join(data_dir, X[1]))
    d_outpatient = pd.read_csv(os.path.join(data_dir, X[2]))
    d_labels = pd.read_csv(os.path.join(data_dir, X[3]))

    # Feature 'whether_admitted': 1 for inpatient claims, 0 for outpatient claims.
    d_inpatient['whether_admitted'] = 1
    d_outpatient['whether_admitted'] = 0

    # Outer-join on every shared column (including 'whether_admitted'), then
    # attach beneficiary details and the provider list.
    shared_cols = [col for col in d_outpatient.columns if col in d_inpatient.columns]
    patient_data = pd.merge(d_inpatient, d_outpatient, on=shared_cols, how='outer')
    patient_data = patient_data.merge(d_beneficiary, how='inner', on='BeneID')
    patient_data = patient_data.merge(d_labels, how='outer', on='Provider')

    # NOTE(review): new columns are created in the same order as before; the
    # saved model presumably depends on feature order - confirm before reordering.

    # Features 'is_dead' / 'is_alive': derived from whether a date of death exists.
    # (Previously both were overwritten with a constant right after being set.)
    has_dod = patient_data['DOD'].notna()
    patient_data['is_dead'] = has_dod.astype(int)
    patient_data['is_alive'] = (~has_dod).astype(int)

    # Frequency features: how often each physician / beneficiary / provider occurs.
    frequency_features = [
        ('AttendingPhysician', 'attend_physician_count'),
        ('OperatingPhysician', 'operate_physician_count'),
        ('BeneID', 'BeneID_count'),
        ('Provider', 'provider_count'),
    ]
    for source_col, target_col in frequency_features:
        patient_data[target_col] = _value_frequency(patient_data, source_col)

    # Parse dates and derive inclusive durations (same-day start/end counts as 1).
    date_format = '%Y-%m-%d'
    patient_data['Claim_Start'] = pd.to_datetime(patient_data['ClaimStartDt'], format=date_format)
    patient_data['Claim_End'] = pd.to_datetime(patient_data['ClaimEndDt'], format=date_format)
    patient_data['DOB'] = pd.to_datetime(patient_data['DOB'], format=date_format)
    patient_data['DOD'] = pd.to_datetime(patient_data['DOD'], format=date_format)
    patient_data['Claim_Days'] = (
        (patient_data['Claim_End'] - patient_data['Claim_Start']).dt.days + 1
    )
    patient_data['Admission_Date'] = pd.to_datetime(patient_data['AdmissionDt'], format=date_format)
    patient_data['Discharge_Date'] = pd.to_datetime(patient_data['DischargeDt'], format=date_format)
    patient_data['hospitalization_days'] = (
        (patient_data['Discharge_Date'] - patient_data['Admission_Date']).dt.days + 1
    )

    # Annual reimbursement minus annual deductible (inpatient + outpatient).
    reimb_amount = (patient_data['IPAnnualReimbursementAmt']
                    + patient_data['OPAnnualReimbursementAmt'])
    deduct_amount = (patient_data['IPAnnualDeductibleAmt']
                     + patient_data['OPAnnualDeductibleAmt'])
    patient_data['total_diff_amount'] = reimb_amount - deduct_amount

    # 14 indicator features: 7 procedure codes and 7 diagnosis codes, each 1 if
    # the code appears in any of the claim's code columns, else 0.
    diagnosis_cols = ['ClmDiagnosisCode_%d' % n for n in range(1, 11)]
    procedure_cols = ['ClmProcedureCode_%d' % n for n in range(1, 7)]
    seven_diag_codes = ['4019', '25000', '2724', 'V5869', '4011', '42731', 'V5861']  # from EDA
    seven_proced_codes = ['4019.0', '9904.0', '2724.0', '8154.0', '66.0', '3893.0', '3995.0']

    for code in seven_proced_codes:
        # Procedure codes are compared numerically; the column name keeps the
        # original string form of the code.
        patient_data['Proc_Code_' + code] = _any_column_equals(
            patient_data, procedure_cols, float(code))
    for code in seven_diag_codes:
        patient_data['Diag_Code_' + code] = _any_column_equals(
            patient_data, diagnosis_cols, code)

    # Physician presence flags: is_primary / is_secondary / is_tertiary.
    patient_data['is_primary'] = np.where(patient_data['AttendingPhysician'].notnull(), 1, 0)
    patient_data['is_secondary'] = np.where(patient_data['OperatingPhysician'].notnull(), 1, 0)
    patient_data['is_tertiary'] = np.where(patient_data['OtherPhysician'].notnull(), 1, 0)

    # Replace 'Y' in RenalDiseaseIndicator with 1; every other value becomes 0.
    patient_data['RenalDiseaseIndicator'] = np.where(
        patient_data['RenalDiseaseIndicator'] == 'Y', 1, 0)

    # Frequency of the first three procedure and first three diagnosis codes.
    for code_col in procedure_cols[:3] + diagnosis_cols[:3]:
        patient_data[code_col + '_count'] = _value_frequency(patient_data, code_col)

    p_val = patient_data['Provider'].values  # kept for the output frame

    # Drop identifiers, raw dates and raw code columns that are not model inputs.
    col_to_remove = ['Provider', 'BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt',
                     'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
                     'ClmAdmitDiagnosisCode', 'NoOfMonths_PartACov',
                     'NoOfMonths_PartBCov', 'DiagnosisGroupCode', 'AdmissionDt',
                     'DischargeDt']
    oth_cols = ['DOB', 'DOD', 'Claim_Start', 'Admission_Date', 'Claim_End', 'Discharge_Date']
    patient_data = patient_data.drop(
        columns=col_to_remove + diagnosis_cols + procedure_cols + oth_cols)

    # Fill remaining NaN values with 0. The result must be assigned back:
    # fillna returns a new frame. Done after the datetime columns are dropped
    # so that only feature columns are filled.
    patient_data = patient_data.fillna(0)

    patient_data.to_csv('patient_data_final.csv')

    # Min-max scale every column independently.
    # NOTE(review): the scaler is fitted on the data being scored, so the scaled
    # values depend on the batch contents - confirm this matches training.
    min_max_scaler = preprocessing.MinMaxScaler()
    patient_data = pd.DataFrame(
        min_max_scaler.fit_transform(patient_data),
        columns=patient_data.columns,
        index=patient_data.index,
    )

    # NOTE(review): joblib.load unpickles the file; only load trusted model files.
    best_model = joblib.load(model_path)

    # Apply the custom threshold to the positive-class probability instead of
    # using model.predict().
    y_pred = best_model.predict_proba(patient_data)[:, 1]
    y_pred_th = (y_pred >= threshold).astype(int)

    patient_df = pd.DataFrame()
    patient_df['providerID'] = p_val
    patient_df['y_predicted'] = y_pred_th
    return patient_df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment