Last active
May 16, 2022 11:53
-
-
Save audhiaprilliant/8562626abe75ddf5b9a1ed31243cd3bf to your computer and use it in GitHub Desktop.
End-to-end machine learning model deployment using Flask
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -------------------- TRAINING SET --------------------
# Print a concise summary of the training data frame
df_train.info()

# Cast Credit_History to object and Loan_Status to int, then list the
# columns that are now object-typed
df_train = df_train.astype({'Credit_History': object, 'Loan_Status': int})
df_train.select_dtypes(include=['object']).dtypes

# Frequency table for every object-typed column
for column in df_train.select_dtypes('object'):
    print(df_train[column].value_counts(), '\n')

# Count missing values per column
df_train.isna().sum()
# Handle missing values
# NOTE: each column is filled by assigning the result back to the frame.
# Calling ``fillna(..., inplace=True)`` on ``df_train[col]`` is a chained
# in-place operation: it warns on pandas 2.x and leaves ``df_train``
# unchanged once Copy-on-Write is enabled.

# 1 Dependents
print('Number of missing dependents is about {} rows'.format(df_train['Dependents'].isna().sum()))
# Replace missing values with "0"
df_train['Dependents'] = df_train['Dependents'].fillna(value='0')

# 2 Self_Employed
print('Number of missing Self_Employed is about {} rows'.format(df_train['Self_Employed'].isna().sum()))
# Replace missing values with "No"
df_train['Self_Employed'] = df_train['Self_Employed'].fillna(value='No')

# 3 Loan_Amount_Term
# Summary statistics of the loan term for each loan status
df_train[['Loan_Amount_Term', 'Loan_Status']].groupby('Loan_Status').describe()
print('Percentile 20th: {}'.format(df_train['Loan_Amount_Term'].quantile(q=0.2)))
# Replace missing values with 360
df_train['Loan_Amount_Term'] = df_train['Loan_Amount_Term'].fillna(value=360)
# 4 Credit_History
# Cross tabulation of credit history against loan status (with margins),
# with the column-axis name cleared
df_cred_hist = (
    pd.crosstab(df_train['Credit_History'], df_train['Loan_Status'], margins=True)
    .reset_index()
    .rename_axis(None, axis=1)
)
# Drop the final row, which holds the margin totals
df_cred_hist = df_cred_hist.drop(index=df_cred_hist.index[-1])
df_cred_hist = df_cred_hist.rename(
    columns={'Credit_History': 'Credit History', 0: 'No', 1: 'Yes'}
)

# Rows whose credit history is missing, split by loan status
credit_missing = df_train['Credit_History'].isna()
pos_cred_hist0 = df_train[credit_missing & (df_train['Loan_Status'] == 0)]
pos_cred_hist1 = df_train[credit_missing & (df_train['Loan_Status'] == 1)]
print(f'Number of rows with Loan_Status is No but Credit_History is NaN : {len(pos_cred_hist0)}')
print(f'Number of rows with Loan_Status is Yes but Credit_History is NaN : {len(pos_cred_hist1)}')

# Fill a missing credit history with the row's loan status (0 -> 0.0,
# 1 -> 1.0); every other value is kept as it is.
# NOTE(review): this imputes a feature from the target label, which is not
# available when predicting on new data - confirm this is intended.
credit_loan = zip(df_train['Credit_History'], df_train['Loan_Status'])
df_train['Credit_History'] = [
    float(status) if np.isnan(credit) and status in (0, 1) else credit
    for credit, status in credit_loan
]
# 5 Gender and Loan Amount
# Remove, in place, every row that still holds at least one missing value
# (dropna defaults: axis=0, how='any')
df_train.dropna(inplace=True)
# Recount missing values per column
df_train.isna().sum()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment