Skip to content

Instantly share code, notes, and snippets.

@audhiaprilliant
Last active May 16, 2022 11:53
Show Gist options
  • Select an option

  • Save audhiaprilliant/8562626abe75ddf5b9a1ed31243cd3bf to your computer and use it in GitHub Desktop.

Select an option

Save audhiaprilliant/8562626abe75ddf5b9a1ed31243cd3bf to your computer and use it in GitHub Desktop.
End-to-end machine learning model deployment using Flask
# -------------------- TRAINING SET --------------------
# Exploratory pass over the training frame.
# NOTE(review): the bare expressions below (e.g. `.dtypes`, `.isna().sum()`)
# discard their result when run as a plain script; presumably they are meant
# to be displayed by a notebook cell - confirm how this file is executed
# Data frame metadata
df_train.info()
# Change column types: Credit_History becomes object so that it is selected
# by the object-dtype summaries below; Loan_Status becomes int
# NOTE(review): assumes Loan_Status is already numeric with no missing values,
# otherwise the int cast raises - confirm against the loading step
df_train = df_train.astype({'Credit_History': object, 'Loan_Status': int})
df_train.select_dtypes(include = ['object']).dtypes
# Summary statistics of categorical columns
for column in df_train.select_dtypes('object').columns:
    # Frequency of each distinct value, followed by a blank separator line
    print(df_train[column].value_counts(), '\n')
# Check missing values
df_train.isna().sum()
# Handle missing values
# Every fill below assigns the result back to df_train. Calling
# fillna(inplace = True) on df_train[col] acts on an intermediate Series:
# pandas warns about that chained in-place call and, with Copy-on-Write
# enabled, df_train itself is left unchanged.
# 1 Dependents
print('Number of missing dependents is about {} rows'.format(df_train['Dependents'].isna().sum()))
# Replace missing values with the string "0"
df_train['Dependents'] = df_train['Dependents'].fillna(value = '0')
# 2 Self_Employed
print('Number of missing Self_Employed is about {} rows'.format(df_train['Self_Employed'].isna().sum()))
# Replace missing values with "No"
df_train['Self_Employed'] = df_train['Self_Employed'].fillna(value = 'No')
# 3 Loan_Amount_Term
# Descriptive statistics of the loan term per loan status (result is not
# stored; it is only visible when displayed interactively)
df_train[['Loan_Amount_Term', 'Loan_Status']].groupby('Loan_Status').describe()
print('Percentile 20th: {}'.format(df_train['Loan_Amount_Term'].quantile(q = 0.2)))
# Replace missing values with the number 360
df_train['Loan_Amount_Term'] = df_train['Loan_Amount_Term'].fillna(value = 360)
# 4 Credit_History
# Contingency table of credit history against loan status, including totals
df_cred_hist = pd.crosstab(
    df_train['Credit_History'],
    df_train['Loan_Status'],
    margins = True
).reset_index()
# Clear the name carried by the column index
df_cred_hist.columns.name = None
# Drop the final row, i.e. the totals row added by margins = True
last_row = len(df_cred_hist) - 1
df_cred_hist = df_cred_hist.drop(index = [last_row])
df_cred_hist = df_cred_hist.rename(
    columns = {'Credit_History': 'Credit History', 0: 'No', 1: 'Yes'}
)
# Rows whose credit history is missing, split by loan status
credit_missing = df_train['Credit_History'].isna()
pos_cred_hist0 = df_train[credit_missing & (df_train['Loan_Status'] == 0)]
pos_cred_hist1 = df_train[credit_missing & (df_train['Loan_Status'] == 1)]
print('Number of rows with Loan_Status is No but Credit_History is NaN : {}'.format(pos_cred_hist0.shape[0]))
print('Number of rows with Loan_Status is Yes but Credit_History is NaN : {}'.format(pos_cred_hist1.shape[0]))


def _impute_credit(credit, status):
    """Return credit unchanged unless it is NaN and status is 0 or 1.

    A NaN credit becomes 0.0 for status 0 and 1.0 for status 1; a NaN credit
    with any other status is returned as is.
    """
    if not np.isnan(credit):
        return credit
    if status == 0:
        return 0.0
    if status == 1:
        return 1.0
    return credit


# Fill each missing credit history from the loan status of the same row
# NOTE(review): this uses Loan_Status to fill a feature; if Loan_Status is the
# prediction target it will not be available at inference time - confirm
credit_loan = zip(df_train['Credit_History'], df_train['Loan_Status'])
df_train['Credit_History'] = [
    _impute_credit(credit, status) for credit, status in credit_loan
]
# 5 Gender and Loan Amount
# Remove, in place, every row that still has a missing value in any column
df_train.dropna(axis = 'index', how = 'any', inplace = True)
# Count of missing values left in each column
df_train.isnull().sum()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment