Skip to content

Instantly share code, notes, and snippets.

@PranjalDureja0002
Created March 15, 2021 10:13
Show Gist options
  • Save PranjalDureja0002/57ea91d224656b3a3710465709757ffd to your computer and use it in GitHub Desktop.
Save PranjalDureja0002/57ea91d224656b3a3710465709757ffd to your computer and use it in GitHub Desktop.
final
# Raw claim-code columns present in the merged claims data.
_DIAGNOSIS_CODE_COLS = ['ClmDiagnosisCode_%d' % k for k in range(1, 11)]
_PROCEDURE_CODE_COLS = ['ClmProcedureCode_%d' % k for k in range(1, 7)]

# Codes that get their own 0/1 indicator feature (picked during EDA).
_TOP_DIAGNOSIS_CODES = ['4019', '25000', '2724', 'V5869', '4011', '42731', 'V5861']
_TOP_PROCEDURE_CODES = ['4019.0', '9904.0', '2724.0', '8154.0', '66.0', '3893.0', '3995.0']

# Identifier / raw columns that must not be fed to the model.
_ID_AND_RAW_COLS = [
    'Provider', 'BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt',
    'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
    'ClmAdmitDiagnosisCode', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
    'DiagnosisGroupCode', 'AdmissionDt', 'DischargeDt',
]
# Datetime columns that only exist to derive day-count features.
_DATE_COLS = ['DOB', 'DOD', 'Claim_Start', 'Admission_Date', 'Claim_End', 'Discharge_Date']


def _add_count_feature(frame, source_col, target_col):
    """Add ``target_col``: how many rows of ``frame`` share each ``source_col`` value."""
    frequencies = frame[source_col].value_counts().to_dict()
    # value_counts() skips NaN, so rows with a missing source value map to NaN.
    frame[target_col] = frame[source_col].map(frequencies)


def _add_code_flags(frame, code_cols, codes, prefix, as_float=False):
    """Add one 0/1 column per code: 1 when the code occurs in any of ``code_cols``."""
    for code in codes:
        # Procedure codes are compared as floats, diagnosis codes as strings.
        needle = float(code) if as_float else code
        frame[prefix + str(code)] = frame[code_cols].eq(needle).any(axis=1).astype(int)


def final_fun_1(X):
    """Build features from raw claim files and predict with the saved model.

    Parameters
    ----------
    X : sequence of str
        Four CSV file names, read from ``health_cs_data/``, in this order:
        beneficiary data, inpatient claims, outpatient claims, provider labels.

    Returns
    -------
    pandas.DataFrame
        One row per merged claim with columns ``providerID`` and
        ``y_predicted`` (1 when the predicted probability of the positive
        class is at least 0.331, else 0).

    Side effects
    ------------
    Writes the engineered, unscaled feature table to
    ``patient_data_final.csv`` and loads ``best_model.pkl`` from the
    current working directory.
    """
    d_beneficiary = pd.read_csv('health_cs_data/' + X[0])
    d_inpatient = pd.read_csv('health_cs_data/' + X[1])
    d_outpatient = pd.read_csv('health_cs_data/' + X[2])
    d_labels = pd.read_csv('health_cs_data/' + X[3])

    # Feature 'whether_admitted': 1 for inpatient claims, 0 for outpatient claims.
    d_inpatient['whether_admitted'] = 1
    d_outpatient['whether_admitted'] = 0

    # Outer-join the two claim tables on every column they have in common.
    shared_cols = [col for col in d_outpatient.columns if col in d_inpatient.columns]
    patient_data = pd.merge(d_inpatient, d_outpatient,
                            left_on=shared_cols, right_on=shared_cols, how='outer')
    patient_data = (
        patient_data
        .merge(d_beneficiary, how='inner', on='BeneID')
        .merge(d_labels, how='outer', on='Provider')
    )

    # Features 'is_dead' / 'is_alive': is_dead is 1 when a date of death is
    # recorded, else 0; is_alive is the complement.
    # FIX: the flags used to be overwritten with constants right after being set.
    # NOTE(review): confirm the saved model was trained with this definition.
    patient_data['is_dead'] = patient_data['DOD'].notna().astype(int)
    patient_data['is_alive'] = 1 - patient_data['is_dead']

    # Frequency encodings of physicians, beneficiaries and providers.
    _add_count_feature(patient_data, 'AttendingPhysician', 'attend_physician_count')
    _add_count_feature(patient_data, 'OperatingPhysician', 'operate_physician_count')
    _add_count_feature(patient_data, 'BeneID', 'BeneID_count')
    _add_count_feature(patient_data, 'Provider', 'provider_count')

    # Parse dates and derive inclusive day counts (hence the +1).
    date_format = '%Y-%m-%d'
    patient_data['Claim_Start'] = pd.to_datetime(patient_data['ClaimStartDt'], format=date_format)
    patient_data['Claim_End'] = pd.to_datetime(patient_data['ClaimEndDt'], format=date_format)
    patient_data['DOB'] = pd.to_datetime(patient_data['DOB'], format=date_format)
    patient_data['DOD'] = pd.to_datetime(patient_data['DOD'], format=date_format)
    patient_data['Claim_Days'] = (patient_data['Claim_End'] - patient_data['Claim_Start']).dt.days + 1
    patient_data['Admission_Date'] = pd.to_datetime(patient_data['AdmissionDt'], format=date_format)
    patient_data['Discharge_Date'] = pd.to_datetime(patient_data['DischargeDt'], format=date_format)
    patient_data['hospitalization_days'] = (
        (patient_data['Discharge_Date'] - patient_data['Admission_Date']).dt.days + 1
    )

    # Annual reimbursement minus annual deductible, inpatient + outpatient.
    reimb_amount = patient_data['IPAnnualReimbursementAmt'] + patient_data['OPAnnualReimbursementAmt']
    deduct_amount = patient_data['IPAnnualDeductibleAmt'] + patient_data['OPAnnualDeductibleAmt']
    patient_data['total_diff_amount'] = reimb_amount - deduct_amount

    # 7 procedure-code and 7 diagnosis-code indicators (14 new 0/1 features).
    _add_code_flags(patient_data, _PROCEDURE_CODE_COLS, _TOP_PROCEDURE_CODES,
                    'Proc_Code_', as_float=True)
    _add_code_flags(patient_data, _DIAGNOSIS_CODE_COLS, _TOP_DIAGNOSIS_CODES,
                    'Diag_Code_')

    # is_primary / is_secondary / is_tertiary: whether an attending,
    # operating or other physician is recorded on the claim.
    patient_data['is_primary'] = np.where(patient_data['AttendingPhysician'].notnull(), 1, 0)
    patient_data['is_secondary'] = np.where(patient_data['OperatingPhysician'].notnull(), 1, 0)
    patient_data['is_tertiary'] = np.where(patient_data['OtherPhysician'].notnull(), 1, 0)

    # RenalDiseaseIndicator: 'Y' becomes 1, anything else becomes 0.
    patient_data['RenalDiseaseIndicator'] = np.where(
        patient_data['RenalDiseaseIndicator'] == 'Y', 1, 0)

    # Frequency encodings of the first three procedure and diagnosis codes.
    for code_col in _PROCEDURE_CODE_COLS[:3] + _DIAGNOSIS_CODE_COLS[:3]:
        _add_count_feature(patient_data, code_col, code_col + '_count')

    # Keep the provider ids for the output before the column is dropped.
    p_val = patient_data['Provider'].values

    patient_data = patient_data.drop(
        columns=_ID_AND_RAW_COLS + _DIAGNOSIS_CODE_COLS + _PROCEDURE_CODE_COLS + _DATE_COLS)

    # FIX: fillna() returns a new frame; the result used to be discarded, so
    # NaNs were never replaced. Filling after the drop keeps the datetime and
    # string columns out of it.
    patient_data = patient_data.fillna(0)
    patient_data.to_csv('patient_data_final.csv')

    # Min-max scaling; MinMaxScaler scales each column independently.
    # NOTE(review): the scaler is fitted on the data being scored, not on the
    # training data - confirm this matches how the model was trained.
    min_max_scaler = preprocessing.MinMaxScaler()
    patient_data = pd.DataFrame(
        min_max_scaler.fit_transform(patient_data),
        columns=patient_data.columns,
        index=patient_data.index,
    )

    # SECURITY: joblib.load unpickles the file; only load trusted model files.
    best_model = joblib.load('best_model.pkl')

    # Custom decision threshold (computed for best_model) instead of model.predict().
    threshold = 0.331
    y_pred = best_model.predict_proba(patient_data)[:, 1]
    y_pred_th = (y_pred >= threshold).astype(int)

    return pd.DataFrame({'providerID': p_val, 'y_predicted': y_pred_th})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment