This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
# Load the HR employee attrition CSV and print a first overview:
# its shape, summary statistics, and the number of missing values per column.
data_df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
print(data_df.shape)
display(data_df.describe())
print(data_df.isnull().sum())

# Map every column that holds exactly one distinct value to that value (as a list).
one_uniq = {}
for col in data_df.columns:
    distinct_values = data_df[col].unique()
    if len(distinct_values) == 1:
        one_uniq[col] = distinct_values.tolist()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def plot_categorical(df: pd.DataFrame , col:str): | |
| """ | |
| Function to plot categorical data as a pie chart using Plotly | |
| @Args: | |
| df: pandas data frame | |
| col: A string column name within pandas data frame to plot | |
| Return: | |
| No object return, only visualization | |
| """ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fixed left-to-right order of the work-life-balance categories on the x-axis.
_tmp_order = ['Bad', 'Good', 'Better', 'Best']
f, axes = plt.subplots(2, 2, figsize=(14, 14))

# Top-left panel: count of rows per work-life-balance level, split by attrition.
top_left = axes[0][0]
sns.countplot(
    x='WorkLifeBalance',
    hue='Attrition',
    palette={'Yes': 'r', 'No': 'skyblue'},
    data=data_df,
    order=_tmp_order,
    ax=top_left,
)
top_left.set_title('Overall Work Life Balance')

# Top-right panel: distance from home per work-life-balance level, split by attrition.
top_right = axes[0][1]
sns.boxplot(
    x='WorkLifeBalance',
    y='DistanceFromHome',
    hue='Attrition',
    palette={'Yes': 'r', 'No': 'skyblue'},
    data=data_df,
    order=_tmp_order,
    ax=top_right,
)
top_right.set_title('Compare with distance from home')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def create_generation_feature(age_val: int) -> str: | |
| """ | |
| Function to convert an age value into a generation string | |
| @Args: | |
| age_val (int): the age value from data frame | |
| Return: | |
| String output specifies the generation | |
| """ | |
| out = '' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bar chart of the training-label counts, with each bar annotated by the
# share of the training set it represents.
plt.figure(figsize=(10, 6))
total_ = float(len(y_train))
# Pass the labels by keyword: seaborn 0.11 deprecated positional variables and
# from 0.12 the only positional argument is `data`, so a bare positional value
# is no longer interpreted as `x`. The keyword form works on older releases too.
ax = sns.countplot(x=y_train)
for p in ax.patches:
    height = p.get_height()
    ax.text(
        p.get_x() + p.get_width() / 2.,  # horizontal centre of the bar
        height + 10,                     # a little above the top of the bar
        '{0:1.1%}'.format(height / total_),
        ha='center',
    )
plt.title('Training label distribution', fontsize=16)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def prediction_evaluation (algorithm, X_train, X_test, y_train, y_test, | |
| predictor_cols, cf = 'features'): | |
| """ | |
| Function to predict and evaluate the provided algorithm, using the Plotly library | |
| to visualize the confusion matrix and ROC curve, as well as to provide the feature importances. | |
| @Args: | |
| algorithm: the model algorithm object | |
| X_train: the predictor features of the training pandas data frame | |
| X_test: the predictor features of the testing pandas data frame | |
| y_train: the target variable of the training pandas data frame |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Evaluate a depth-3 decision tree with the project helper, then export the
# tree in Graphviz DOT format.
tree_clf = tree.DecisionTreeClassifier(max_depth=3, random_state=SEED)
_ = myUtilityFunction.prediction_evaluation(
    tree_clf, X_train, X_test, y_train, y_test, X_train.columns, 'features'
)
dot_data = tree.export_graphviz(
    tree_clf,
    out_file=None,  # no output file: the DOT source is returned as a string
    feature_names=X_train.columns,
    class_names=['No', 'Yes'],
    filled=True,
    rounded=True,
    special_characters=True,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Baseline XGBoost classifier with fixed hyper-parameters, evaluated through
# the same project helper as the other models.
xgb_clf = xgboost.XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=SEED,
)
_ = myUtilityFunction.prediction_evaluation(
    xgb_clf, X_train, X_test, y_train, y_test, X_train.columns, "features"
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.model_selection import RandomizedSearchCV | |
| xgb_clf = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1) | |
| params = {'n_estimators': [50, 100, 200, 300], | |
| 'learning_rate': [0.01, 0.05, 0.1, 0.15], | |
| 'min_child_weight': [1, 2, 3, 5, 10], | |
| 'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1], | |
| 'subsample': [0.6, 0.7, 0.8], | |
| 'colsample_bytree': [0.6, 0.7, 0.8], | |
| 'max_depth': [3, 4, 5], |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Grid-search a logistic regression over regularisation strength, penalty type
# and class weighting, scored by F1 with 5-fold cross-validation.
cv_params = {
    'C': [0.001, 0.01, 0.1, 1., 10., 100.],
    'penalty': ['l1', 'l2'],
    'class_weight': [None, 'balanced'],
}
# The grid tries both 'l1' and 'l2'. scikit-learn's default solver since 0.22
# ('lbfgs') does not support 'l1', so every 'l1' candidate would fail to fit.
# Pin 'liblinear', which handles both penalties and was the default before 0.22.
fix_params = {'random_state': SEED, 'solver': 'liblinear'}
log_cv_1 = GridSearchCV(LogisticRegression(**fix_params), cv_params, scoring='f1', cv=5)
log_cv_1.fit(X_train, y_train)

# Build a fresh estimator from the fixed settings plus the best grid point and
# hand it to the project helper for evaluation.
log_clf_all = LogisticRegression(**{**fix_params, **log_cv_1.best_params_})
_ = myUtilityFunction.prediction_evaluation(
    log_clf_all, X_train, X_test, y_train, y_test, X_train.columns, "coefficients"
)
OlderNewer