Created
December 1, 2019 22:08
-
-
Save cordon-thiago/4c244f3ff667bf2c62e74469ab2b557f to your computer and use it in GitHub Desktop.
Auxiliary functions for Hard bounce article.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
def plot_roc_curve(fpr, tpr, label=None):
    """
    Plot a ROC curve on a new 8x8 inch figure.

    Modified from
    Hands-On Machine learning with Scikit-Learn and TensorFlow; p.91

    fpr = false positive rate values (plotted on the x axis)
    tpr = true positive rate values (plotted on the y axis)
    label = optional legend entry for the curve; when it is None no legend
            is drawn

    The function does not call plt.show(); the figure stays current so the
    caller can decorate, show or save it.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    plt.figure(figsize=(8, 8))
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    # Small margins so a curve hugging the left/top border stays visible.
    plt.axis([-0.005, 1, 0, 1.005])
    plt.xticks(np.arange(0, 1, 0.05), rotation=90)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")
    # Without a label there is nothing to list, and matplotlib would only
    # emit a "no artists with labels" warning.
    if label is not None:
        plt.legend(loc='best')
def plot_feature_importance(feature_imp, feature_imp_idx):
    """
    Plot the feature importance graph and display it.

    feature_imp = importance scores, drawn on the x axis
    feature_imp_idx = feature names, drawn on the y axis (one per score).
                      Pass None to use feature_imp.index.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Fall back to the index of the scores when no explicit names are given.
    if feature_imp_idx is None:
        feature_imp_idx = feature_imp.index

    fig, ax = plt.subplots(figsize=(10, 8))
    # Creating a bar plot
    sns.barplot(x=feature_imp, y=feature_imp_idx, ax=ax)
    # Add labels to the graph
    ax.set_xlabel('Feature Importance Score')
    ax.set_ylabel('Features')
    ax.set_title("Visualizing Important Features")
    # No legend: the bars carry no labels, so a legend call would only
    # trigger a matplotlib warning.
    plt.show()
def plot_confusion_matrix(y_true, y_pred, classes, cmap,
                          normalize=False,
                          title=None):
    """
    Print and plot the confusion matrix.

    Normalization can be applied by setting `normalize=True`; each row is
    then divided by its total (rows whose total is zero stay at 0).

    y_true = true labels
    y_pred = predicted labels
    classes = display names of the classes (list or array). The labels found
              in y_true / y_pred are used as positional indexes into it, so
              they are expected to be integers.
    cmap = matplotlib colormap used for the matrix image
    normalize = plot row-normalized rates instead of absolute counts
    title = plot title; when empty/None the accuracy score is used as title

    Returns the matplotlib Axes holding the plot.

    References: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    """
    from sklearn import metrics
    from sklearn.utils.multiclass import unique_labels
    import matplotlib.pyplot as plt
    import numpy as np

    if not title:
        title = 'Accuracy Score: {0}'.format(metrics.accuracy_score(y_true, y_pred))

    # Compute confusion matrix
    cm = metrics.confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data; asarray lets callers pass
    # a plain list as well as an ndarray.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label that only occurs in y_pred has an all-zero row; leave it at
        # 0 instead of producing NaN through a division by zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           ylabel='True label',
           xlabel='Predicted label')
    ax.set_title(title, size=15)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax
def freqTable(lines_list, cols_list, show_total, perc_type=None):
    """
    Build a frequency table (cross tabulation) with pandas.crosstab.

    lines_list: df column list for exhibition in lines of frequency table. Ex: [df["col1"], df["col2"]]
    cols_list: df column list for exhibition in columns of frequency table. Ex: [df["col1"], df["col2"]]
    show_total: Show totals? True | False (the totals row/column is named "Total")
    perc_type: "columns" for column percentage; "index" for row percentage or None for absolute value

    Returns the DataFrame produced by pandas.crosstab.

    References: https://pbpython.com/pandas-crosstab.html
    """
    import pandas as pd

    return pd.crosstab(
        index=lines_list,
        columns=cols_list,
        margins=show_total,
        margins_name="Total",
        # normalize=False is crosstab's own default and yields absolute counts
        normalize=False if perc_type is None else perc_type,
    )
def percMissing(df):
    """
    Compute the percentage of missing values of every column.

    Input: DataFrame
    Output: DataFrame with the columns 'col' (column name) and
            'perc_missing' (0-100), sorted by 'perc_missing' descending.
            For a DataFrame without rows the percentage is NaN.
    """
    import pandas as pd

    # isna() marks missing cells; the mean of those booleans is the fraction
    # of missing rows per column.
    perc = df.isna().mean() * 100
    # Create dataframe with results
    df_missing = pd.DataFrame({'col': perc.index, 'perc_missing': perc.values})
    return df_missing.sort_values(by=['perc_missing'], ascending=False)
def getEmailDomain(email):
    """
    Extract the lower-cased domain of an email address.

    Example: 'User@Example.COM' will return 'example.com'
    Input: email string
    Output: the text between the first and the second '@' (for a regular
            address: everything after the '@'), in lower case
    Raises: IndexError when the string contains no '@'
    """
    pieces = email.split('@')
    return pieces[1].lower()
def getPiece1EmailDomain(domain):
    """
    Extract piece 1 from an email domain.

    Piece 1 is the last dot-separated label of a two-label domain and the
    second-to-last label of a domain with three or more labels. The label
    is returned without a leading dot.

    Example: 'gmail.com' and 'uol.com.br' both return 'com'
    Input: email domain string
    Output: the label, or 'missing' when the domain contains no dot
    """
    labels = domain.split('.')
    if len(labels) == 2:
        return labels[-1]
    if len(labels) >= 3:
        return labels[-2]
    return 'missing'
def getPiece2EmailDomain(domain):
    """
    Extract piece 2 from an email domain.

    Piece 2 is the last dot-separated label of a domain that has three or
    more labels. The label is returned without a leading dot.

    Example: 'uol.com.br' returns 'br'; 'gmail.com' returns 'missing'
    Input: email domain string
    Output: the label, or 'missing' when the domain has fewer than three labels
    """
    labels = domain.split('.')
    if len(labels) >= 3:
        return labels[-1]
    return 'missing'
def getEmailUser(email):
    """
    Extract the lower-cased user part (text before the first '@') of an email.

    Example: 'Teste@gmail.com' will return 'teste'
    Input: email string
    Output: user string in lower case; when the string has no '@' the whole
            string is returned in lower case
    """
    user = email.split('@')[0]
    return user.lower()
def percentageNumberInStr(string):
    """
    Calculate the share of digit characters contained in a string.

    Only the ASCII digits 0-9 are counted. Despite the function name the
    result is a fraction between 0 and 1, not a value between 0 and 100.

    Input: string
    Output: number of digits / length of the string, or 0 for an empty string
    """
    if len(string) == 0:
        return 0
    digits = ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9')
    digit_count = sum(1 for char in string if char in digits)
    return digit_count / len(string)
def oversampleSMOTE(X, y):
    """
    Resample a dataset using SMOTE oversample (fixed random_state=123).

    Input:
    X = dataframe with x variables (explanatory variables)
    y = Series (or single-column dataframe) with y variable (variable to predict)
    Output:
    df[0] = X dataframe resampled
    df[1] = y dataframe resampled
    """
    from imblearn.over_sampling import SMOTE
    import pandas as pd

    # Get column names
    X_cols = X.columns.values
    if isinstance(y, pd.DataFrame):
        y_cols = list(y.columns)
    else:
        y_cols = [y.name]

    # SMOTE expects a 1-D target; to_numpy().ravel() flattens both a Series
    # and a single-column DataFrame and avoids the deprecated Series.ravel().
    y_values = y.to_numpy().ravel()

    sm = SMOTE(random_state=123)
    X_resampled, y_resampled = sm.fit_resample(X, y_values)
    return pd.DataFrame(X_resampled, columns=X_cols), pd.DataFrame(y_resampled, columns=y_cols)
def buildDummyVariables(df, category_vars):
    """
    Return a DF with categoric variables dummized.

    Input:
    df = dataframe that contains the variables to be dummized
    category_vars = list of variables to be dummized
    Output:
    df = new dataframe with the remaining original columns followed by the
         dummy variables (named '<variable>_<value>'), excluding the original
         categoric variables. The input dataframe is not modified.
    """
    import pandas as pd

    # One block of dummy columns per categoric variable
    dummies = [pd.get_dummies(df[var], prefix=var) for var in category_vars]
    # Columns to keep, excluding the original category variables
    to_keep = [col for col in df.columns.values.tolist() if col not in category_vars]
    # All pieces share df's index, so a column-wise concat lines them up row
    # by row, also when the index holds duplicate values (an index join
    # would multiply those rows).
    return pd.concat([df[to_keep]] + dummies, axis=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment