import pandas as pd

# Load the IBM HR Employee Attrition dataset and take a first look.
data_df = pd.read_csv('../data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
print(data_df.shape)
display(data_df.describe())      # summary statistics (display() assumes a Jupyter notebook)
print(data_df.isnull().sum())    # check for missing values

# Collect columns that contain only a single unique value (no predictive signal).
one_uniq = dict()
for col in data_df.columns:
    if len(data_df[col].unique()) == 1:
        one_uniq.update({col: data_df[col].unique().tolist()})
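The columns collected in one_uniq carry no information for modelling, so a natural next step is to drop them. This is a minimal sketch of that step; the drop call below is an assumption and is not shown in the original snippets:

# Drop the constant columns found above (assumed follow-up step).
data_df = data_df.drop(columns=list(one_uniq.keys()))
print(data_df.shape)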
def plot_categorical(df: pd.DataFrame, col: str):
    """
    Plot a categorical column as a pie chart using Plotly.
    @Args:
        df: pandas data frame
        col: name of a categorical column in df to plot
    Return:
        None; the chart is only displayed
    """
import matplotlib.pyplot as plt
import seaborn as sns

# WorkLifeBalance is assumed to be already mapped to descriptive labels (see the sketch below).
_tmp_order = ['Bad', 'Good', 'Better', 'Best']
f, axes = plt.subplots(2, 2, figsize=(14, 14))
sns.countplot(x='WorkLifeBalance', hue='Attrition', palette={'Yes': 'r', 'No': 'skyblue'},
              data=data_df, order=_tmp_order, ax=axes[0][0])
axes[0][0].set_title('Overall Work Life Balance')
sns.boxplot(x='WorkLifeBalance', y='DistanceFromHome', hue='Attrition',
            palette={'Yes': 'r', 'No': 'skyblue'},
            data=data_df, order=_tmp_order, ax=axes[0][1])
axes[0][1].set_title('Compared with distance from home')
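In the raw CSV, WorkLifeBalance is stored as an integer code (1 to 4). For the plots above to use the string order, the column has to be mapped to labels first; a plausible mapping, based on the dataset's data dictionary and added here as an assumption, would be:

# Hypothetical mapping from the numeric codes to readable labels.
wlb_map = {1: 'Bad', 2: 'Good', 3: 'Better', 4: 'Best'}
data_df['WorkLifeBalance'] = data_df['WorkLifeBalance'].map(wlb_map)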
def create_generation_feature(age_val: int) -> str:
    """
    Convert an age value into a generation label.
    @Args:
        age_val (int): the age value from the data frame
    Return:
        A string specifying the generation
    """
    out = ''
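The rest of the function is cut off in the snippet. One possible binning is sketched below; the exact cut-offs are an assumption, since the original thresholds are not visible:

def create_generation_feature_sketch(age_val: int) -> str:
    # Hypothetical age brackets; the original cut-offs are not shown in the snippet.
    if age_val < 25:
        return 'Gen Z'
    elif age_val < 40:
        return 'Millennial'
    elif age_val < 55:
        return 'Gen X'
    else:
        return 'Baby Boomer'

# Example usage (hypothetical): data_df['Generation'] = data_df['Age'].apply(create_generation_feature_sketch)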
# Plot the class balance of the training labels, annotating each bar with its percentage.
plt.figure(figsize=(10, 6))
total_ = float(len(y_train))
ax = sns.countplot(x=y_train)    # pass as keyword; positional data is not accepted in newer seaborn
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width() / 2.,
            height + 10,
            '{0:1.1%}'.format(height / total_),
            ha='center')
plt.title('Training label distribution', fontsize=16)
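y_train comes from a train/test split that is not shown in these snippets. A typical stratified split might look like the following; the 80/20 ratio and the 0/1 target encoding are assumptions:

from sklearn.model_selection import train_test_split

# Hypothetical split; 'Attrition' is the target column in this dataset.
# Categorical predictors would still need encoding before modelling.
X = data_df.drop(columns=['Attrition'])
y = data_df['Attrition'].map({'No': 0, 'Yes': 1})
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED)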
def prediction_evaluation(algorithm, X_train, X_test, y_train, y_test,
                          predictor_cols, cf='features'):
    """
    Fit the provided algorithm, predict on the test set, and use Plotly to
    visualize the confusion matrix and ROC curve as well as the feature importances.
    @Args:
        algorithm: the model/algorithm object
        X_train: the predictor features of the training pandas data frame
        X_test: the predictor features of the testing pandas data frame
        y_train: the target variable of the training pandas data frame
        y_test: the target variable of the testing pandas data frame
        predictor_cols: the predictor column names, used to label the importances
        cf: 'features' to report feature importances, or 'coefficients' for linear models
    """
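The body of prediction_evaluation is not shown in this snippet. For reference, the kind of evaluation it describes can be reproduced with plain scikit-learn; the sketch below is an assumption about the core computations (it uses sklearn.metrics rather than the Plotly figures the helper builds):

from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

def evaluate_sketch(algorithm, X_train, X_test, y_train, y_test):
    # Fit on the training split and predict on the held-out split.
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    y_score = algorithm.predict_proba(X_test)[:, 1]

    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('ROC AUC:', roc_auc_score(y_test, y_score))
    fpr, tpr, _ = roc_curve(y_test, y_score)   # points for an ROC curve plot
    return algorithm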
from sklearn import tree

# Baseline model: a shallow decision tree, evaluated with the helper above.
tree_clf = tree.DecisionTreeClassifier(random_state=SEED, max_depth=3)
_ = myUtilityFunction.prediction_evaluation(tree_clf, X_train, X_test,
                                            y_train, y_test, X_train.columns,
                                            'features')

# Export the fitted tree to Graphviz DOT format for visualization.
dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                feature_names=X_train.columns,
                                class_names=['No', 'Yes'],
                                filled=True, rounded=True,
                                special_characters=True)
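The snippet stops after exporting the DOT string; rendering it is typically one more call (the graphviz package is assumed to be installed):

import graphviz

graph = graphviz.Source(dot_data)   # render the exported tree
graph                               # in a notebook, evaluating this displays the diagram inline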
import xgboost

# Baseline XGBoost classifier with modest hyperparameters.
xgb_clf = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1, learning_rate=0.1,
                                max_depth=3, n_estimators=100)
_ = myUtilityFunction.prediction_evaluation(xgb_clf, X_train, X_test,
                                            y_train, y_test, X_train.columns, "features")
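Before tuning, a quick cross-validated score can serve as a reference point; this check is not part of the original snippets and assumes a 0/1-encoded target:

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(xgb_clf, X_train, y_train, scoring='f1', cv=5)
print('Baseline CV F1: {:.3f} +/- {:.3f}'.format(cv_scores.mean(), cv_scores.std()))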
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter search for XGBoost.
xgb_clf = xgboost.XGBClassifier(random_state=SEED, n_jobs=-1)
params = {'n_estimators': [50, 100, 200, 300],
          'learning_rate': [0.01, 0.05, 0.1, 0.15],
          'min_child_weight': [1, 2, 3, 5, 10],
          'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1],
          'subsample': [0.6, 0.7, 0.8],
          'colsample_bytree': [0.6, 0.7, 0.8],
          'max_depth': [3, 4, 5],
          }  # any remaining entries of the grid are cut off in this snippet
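The search itself is also cut off; a typical way to run it might look like the following, where n_iter, scoring, and cv are assumptions rather than the author's settings:

xgb_search = RandomizedSearchCV(xgb_clf, param_distributions=params,
                                n_iter=50, scoring='f1', cv=5,
                                random_state=SEED, n_jobs=-1)
xgb_search.fit(X_train, y_train)
print(xgb_search.best_params_)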
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Grid search over regularization strength, penalty type, and class weighting.
cv_params = {'C': [0.001, 0.01, 0.1, 1., 10., 100.],
             'penalty': ['l1', 'l2'],
             'class_weight': [None, 'balanced']
             }
fix_params = {'random_state': SEED, 'solver': 'liblinear'}  # liblinear supports both 'l1' and 'l2'
log_cv_1 = GridSearchCV(LogisticRegression(**fix_params), cv_params, scoring='f1', cv=5)
log_cv_1.fit(X_train, y_train)

# Re-instantiate a logistic regression with the best parameters and evaluate it.
log_clf_all = LogisticRegression(**{**fix_params, **log_cv_1.best_params_})
_ = myUtilityFunction.prediction_evaluation(log_clf_all, X_train, X_test, y_train, y_test,
                                            X_train.columns, "coefficients")
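It also helps to inspect what the grid search found before trusting the refit model; both attributes below are standard on a fitted GridSearchCV:

print('Best parameters:', log_cv_1.best_params_)
print('Best CV F1 score: {:.3f}'.format(log_cv_1.best_score_))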