Skip to content

Instantly share code, notes, and snippets.

# Parse the raw year-month-day string columns of inpatient_data into datetime
# columns. DOB and DOD are converted in place; the other four get new columns.
date_format = '%Y-%m-%d'
date_columns = (
    ('AdmissionDt', 'Admission_Date'),
    ('DischargeDt', 'Discharge_Date'),
    ('ClaimStartDt', 'ClaimStart_Date'),
    ('ClaimEndDt', 'ClaimEnd_Date'),
    ('DOB', 'DOB'),
    ('DOD', 'DOD'),
)
for raw_col, parsed_col in date_columns:
    inpatient_data[parsed_col] = pd.to_datetime(inpatient_data[raw_col], format=date_format)

# Age in years at the start of the claim: inclusive day count between birth
# and claim start, divided by the mean year length and rounded.
days_lived = (inpatient_data['ClaimStart_Date'] - inpatient_data['DOB']).dt.days + 1
inpatient_data['Age'] = round(days_lived / 365.25)
# Draw physician_count against id_Count, split by PotentialFraud, for the
# outpatient and then the inpatient claims on one shared figure.
plt.figure(figsize=(15, 9))
for claims in (outpatients_, inpatients_):
    sns.pointplot(
        x=claims.physician_count,
        y=claims.id_Count,
        hue=claims.PotentialFraud,
    )
    plt.suptitle('Physician_attended vs Beneficiaries_count\n')
plt.show()
# For each claims table, attach to every row the number of claims recorded for
# that row's attending physician. After the loop `physician_count` holds the
# inpatient mapping, as it did when the two tables were handled one by one.
for claims in (outpatients_, inpatients_):
    physician_count = claims['AttendingPhysician'].value_counts().to_dict()
    claims['physician_count'] = claims['AttendingPhysician'].map(physician_count)
# Horizontal count plot of the 20 most frequent attending physicians in the
# outpatient claims, with each physician's bar split by PotentialFraud.
ax = sns.countplot(y='AttendingPhysician',data=outpatients_,hue='PotentialFraud',order = outpatients_['AttendingPhysician'].value_counts().head(20).index)
# Claim count per attending physician as a plain dict.
d1 = outpatients_['AttendingPhysician'].value_counts().to_dict()
# Grand total of those per-physician counts.
s_s1 = sum(list(d1.values()))
# NOTE(review): the body of this loop is not visible in this excerpt; it
# presumably annotates each bar using s_s1 -- confirm against the full source.
for p in ax.patches:
# Bar chart of how many rows carry each PotentialFraud label, with every bar
# annotated by its share of all rows in patient_data.
colors_list = ['#5bc0de', '#d9534f']
plt.title("Potential Fraud distribution")
fraud_counts = patient_data['PotentialFraud'].value_counts()
ax = fraud_counts.plot(kind='bar', figsize=(5, 5), width=0.8, color=colors_list)
total = len(patient_data['PotentialFraud'])
for bar in ax.patches:
    share = f'{100 * bar.get_height() / total:.1f}%'
    # Anchor the label at the top of the bar, shifted left from its right edge.
    anchor = (
        bar.get_x() + bar.get_width() - 0.5,
        bar.get_y() + bar.get_height(),
    )
    ax.annotate(share, anchor)
plt.xlabel('Potential_Fraud')
BeneID 0.000000
ClaimID 0.000000
ClaimStartDt 0.000000
ClaimEndDt 0.000000
Provider 0.000000
InscClaimAmtReimbursed 0.000000
AttendingPhysician 0.270149
OperatingPhysician 79.497538
OtherPhysician 64.218548
# Percentage of missing (null) entries in each column of patient_data.
null_counts = patient_data.isnull().sum()
row_count = len(patient_data)
per_miss_values = null_counts * 100 / row_count
# Join keys for the claim tables: every column the outpatient table shares with
# the inpatient table, in outpatient column order.
shared_claim_cols = [
    col for col in train_d_outpatient.columns if col in train_d_inpatient.columns
]

# Stack inpatient and outpatient claims (outer join on the shared columns).
patient_data_io = pd.merge(
    train_d_inpatient,
    train_d_outpatient,
    left_on=shared_claim_cols,
    right_on=shared_claim_cols,
    how='outer',
)

# Attach beneficiary details to each claim, then the per-provider labels.
claims_with_beneficiary = pd.merge(
    patient_data_io, train_d_beneficiary, how='inner', on='BeneID'
)
patient_data = claims_with_beneficiary.merge(label_d_data, how='outer', on='Provider')
providerID y_predicted
0 PRV57070 0
1 PRV57070 1
2 PRV57070 0
3 PRV57070 0
4 PRV57070 0
5 PRV57070 1
6 PRV57070 0
7 PRV57070 1
8 PRV57070 0
# X is indexed at positions 0-3; each entry is a file name that is appended to
# the 'health_cs_data/' directory prefix and loaded with pd.read_csv.
# NOTE(review): only the four CSV loads are visible in this excerpt; the
# preprocessing, feature engineering and prediction steps described in the
# docstring are not shown here -- confirm against the full source.
def final_fun_1(X):
""" function takes raw data as input,preprocessing is done,
feature engineering is performed and predictions made on the
best model already trained"""
# X[0]: beneficiary file, X[1]: inpatient claims file,
# X[2]: outpatient claims file, X[3]: labels file.
d_beneficiary = pd.read_csv('health_cs_data/' + X[0])
d_inpatient = pd.read_csv('health_cs_data/' + X[1])
d_outpatient = pd.read_csv('health_cs_data/' + X[2])
d_labels = pd.read_csv('health_cs_data/' + X[3])
def find_best_threshold(threshold, fpr, tpr):
    """Return the threshold that maximises ``tpr * (1 - fpr)``.

    Args:
        threshold: Candidate decision thresholds (any array-like).
        fpr: False-positive rate at each threshold (array-like, same length).
        tpr: True-positive rate at each threshold (array-like, same length).

    Returns:
        The element of ``threshold`` at the position where ``tpr * (1 - fpr)``
        is largest; the first such position is used on ties.

    Raises:
        ValueError: If the inputs are empty (raised by ``np.argmax``).

    Also prints the maximum score and the chosen threshold rounded to three
    decimals.
    """
    # Convert once so plain lists/tuples work as well as ndarrays.
    threshold = np.asarray(threshold)
    score = np.asarray(tpr) * (1 - np.asarray(fpr))

    # Compute the score once; report the value at the same index that selects
    # the threshold, so the two can never disagree.
    best = np.argmax(score)
    t = threshold[best]
    print("the maximum value of tpr*(1-fpr)", score[best], "for threshold", np.round(t, 3))
    return t
def predict_with_best_t(proba, threshold):
predictions = []
for i in proba:
if i>=threshold:
predictions.append(1)