This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse the '%Y-%m-%d' date columns of inpatient_data into datetimes.
# The admission/discharge/claim columns are written to new "*_Date" columns;
# DOB and DOD are converted in place.
for raw_col, parsed_col in (
    ('AdmissionDt', 'Admission_Date'),
    ('DischargeDt', 'Discharge_Date'),
    ('ClaimStartDt', 'ClaimStart_Date'),
    ('ClaimEndDt', 'ClaimEnd_Date'),
    ('DOB', 'DOB'),
    ('DOD', 'DOD'),
):
    inpatient_data[parsed_col] = pd.to_datetime(inpatient_data[raw_col], format='%Y-%m-%d')

# Age at claim start: elapsed days (plus one) divided by 365.25 and rounded
# to the nearest whole number of years.
inpatient_data['Age'] = round(
    ((inpatient_data['ClaimStart_Date'] - inpatient_data['DOB']).dt.days + 1) / 365.25
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Draw the outpatient and inpatient point plots (physician_count vs id_Count,
# split by PotentialFraud) on the same 15x9 figure.
plt.figure(figsize=(15, 9))
for claims in (outpatients_, inpatients_):
    sns.pointplot(x=claims.physician_count, y=claims.id_Count, hue=claims.PotentialFraud)
# The figure-level title only needs to be set once; it applies to both plots.
plt.suptitle('Physician_attended vs Beneficiaries_count\n')
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Frequency of each attending physician among the outpatient claims; reused
# for the per-row feature, the plot ordering, and the total below.
outpatient_freq = outpatients_['AttendingPhysician'].value_counts()
d1 = outpatient_freq.to_dict()
outpatients_['physician_count'] = outpatients_['AttendingPhysician'].map(d1)

# Same per-row feature for the inpatient claims. `physician_count` ends up
# holding the inpatient mapping, as in the original statement order.
physician_count = inpatients_['AttendingPhysician'].value_counts().to_dict()
inpatients_['physician_count'] = inpatients_['AttendingPhysician'].map(physician_count)

# Count plot restricted to the 20 most frequent outpatient attending physicians.
ax = sns.countplot(
    y='AttendingPhysician',
    data=outpatients_,
    hue='PotentialFraud',
    order=outpatient_freq.head(20).index,
)

# Total number of outpatient claims that have an attending physician recorded.
s_s1 = sum(d1.values())
for p in ax.patches: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar chart of the PotentialFraud class counts, with each bar annotated by its
# share of all rows in patient_data.
colors_list = ['#5bc0de', '#d9534f']
plt.title("Potential Fraud distribution")
ax = patient_data['PotentialFraud'].value_counts().plot(
    kind='bar', figsize=(5, 5), width=0.8, color=colors_list
)
total = len(patient_data['PotentialFraud'])
for p in ax.patches:
    # Bar height is the class count; convert it to a percentage of all rows.
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    # Place the label at the top of the bar, shifted left from its right edge.
    x = p.get_x() + p.get_width() - 0.5
    y = p.get_y() + p.get_height()
    ax.annotate(percentage, (x, y))
plt.xlabel('Potential_Fraud')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BeneID 0.000000 | |
ClaimID 0.000000 | |
ClaimStartDt 0.000000 | |
ClaimEndDt 0.000000 | |
Provider 0.000000 | |
InscClaimAmtReimbursed 0.000000 | |
AttendingPhysician 0.270149 | |
OperatingPhysician 79.497538 | |
OtherPhysician 64.218548 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Percentage of missing entries in each column of patient_data.
per_miss_values = patient_data.isna().sum() * 100 / patient_data.shape[0]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Columns present in both claim tables (in outpatient column order); these are
# the join keys for stacking inpatient and outpatient claims into one frame.
shared_cols = [col for col in train_d_outpatient.columns if col in train_d_inpatient.columns]
patient_data_io = pd.merge(train_d_inpatient, train_d_outpatient, on=shared_cols, how='outer')

# Attach beneficiary details to each claim (inner join on BeneID), then the
# provider-level labels (outer join on Provider).
patient_data = (
    pd.merge(patient_data_io, train_d_beneficiary, how='inner', on='BeneID')
    .merge(label_d_data, how='outer', on='Provider')
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
providerID y_predicted | |
0 PRV57070 0 | |
1 PRV57070 1 | |
2 PRV57070 0 | |
3 PRV57070 0 | |
4 PRV57070 0 | |
5 PRV57070 1 | |
6 PRV57070 0 | |
7 PRV57070 1 | |
8 PRV57070 0 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def final_fun_1(X): | |
""" function takes raw data as input,preprocessing is done, | |
feature engineering is performed and predictions made on the | |
best model already trained""" | |
d_beneficiary = pd.read_csv('health_cs_data/' + X[0]) | |
d_inpatient = pd.read_csv('health_cs_data/' + X[1]) | |
d_outpatient = pd.read_csv('health_cs_data/' + X[2]) | |
d_labels = pd.read_csv('health_cs_data/' + X[3]) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def find_best_threshold(threshold, fpr, tpr):
    """Return the threshold that maximises ``tpr * (1 - fpr)``.

    Args:
        threshold: Candidate thresholds, aligned index-by-index with
            ``fpr`` and ``tpr``.
        fpr: False-positive rate at each threshold.
        tpr: True-positive rate at each threshold.

    Returns:
        The element of ``threshold`` at the position where ``tpr * (1 - fpr)``
        is largest (the first such position on ties).

    The best score and the chosen threshold (rounded to 3 decimals) are also
    printed.
    """
    # np.asarray lets plain lists/tuples work as well as NumPy arrays.
    scores = np.asarray(tpr) * (1 - np.asarray(fpr))
    best = np.argmax(scores)
    t = np.asarray(threshold)[best]
    print("the maximum value of tpr*(1-fpr)", scores[best], "for threshold", np.round(t, 3))
    return t
def predict_with_best_t(proba, threshold): | |
predictions = [] | |
for i in proba: | |
if i>=threshold: | |
predictions.append(1) |