Created
December 29, 2017 14:16
-
-
Save joseph-allen/9e89e627915e6ea291cf06d3af928299 to your computer and use it in GitHub Desktop.
Analysis Helpers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
def plot_histograms(df, variables, n_rows, n_cols):
    """Draw a grid of histograms, one per column name in `variables`.

    Each subplot shows a 10-bin histogram of `df[var]` with the column's
    skewness in the title; tick labels are hidden to reduce clutter.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    variables : sequence of str
        Column names of `df` to histogram.
    n_rows, n_cols : int
        Grid shape; must satisfy n_rows * n_cols >= len(variables).
    """
    fig = plt.figure(figsize=(16, 12))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(bins=10, ax=ax)
        # Original call had a dangling comma (`round(x, )`), which rounded
        # to 0 digits; show two decimals so the skew is actually readable.
        ax.set_title('Skew: ' + str(round(float(df[var_name].skew()), 2)))
        ax.set_xticklabels([], visible=False)
        ax.set_yticklabels([], visible=False)
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()
# good for plotting distributions
def plot_distribution(df, var, target, **kwargs):
    """Overlay KDE curves of `var`, one per level of `target`.

    Optional `row` / `col` keyword arguments facet the grid further.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, var, shade=True)
    grid.set(xlim=(0, df[var].max()))
    grid.add_legend()
# good for plotting categorical data
def plot_categories(df, cat, target, **kwargs):
    """Bar-plot the mean of `target` for each level of categorical `cat`.

    Optional `row` / `col` keyword arguments facet the grid further.
    """
    grid = sns.FacetGrid(df, row=kwargs.get('row'), col=kwargs.get('col'))
    grid.map(sns.barplot, cat, target)
    grid.add_legend()
# plot correlations for seeing how similar features are
def plot_correlation_map(df):
    """Render an annotated heatmap of the DataFrame's correlation matrix."""
    _, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        df.corr(),
        # Diverging palette: blue for negative, red for positive correlation.
        cmap=sns.diverging_palette(220, 10, as_cmap=True),
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )
def describe_more(df):
    """Summarize every column of `df`: name, distinct-value count, dtype.

    Returns a DataFrame with columns 'Variable', 'Levels' (number of
    distinct non-null values), and 'Datatype', sorted ascending by
    'Levels'. Replaces the deprecated top-level ``pd.value_counts`` with
    ``Series.nunique()``, which is exactly ``len(value_counts(...))``
    under the default ``dropna=True``.
    """
    summary = pd.DataFrame({
        'Variable': list(df.columns),
        'Levels': [df[col].nunique() for col in df],
        'Datatype': [df[col].dtypes for col in df],
    })
    summary.sort_values(by='Levels', inplace=True)
    return summary
def plot_variable_importance(X, y):
    """Fit a decision tree on (X, y) and chart its feature importances.

    Uses a fixed random_state for reproducibility and delegates the
    actual plotting to `plot_model_var_imp`.
    """
    classifier = DecisionTreeClassifier(random_state=99)
    classifier.fit(X, y)
    plot_model_var_imp(classifier, X, y)
def plot_model_var_imp(model, X, y):
    """Bar-chart the model's top-10 feature importances and print its score.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_`` and ``score(X, y)``.
    X : pandas.DataFrame
        Features the model was fitted on (column names label the bars).
    y : array-like
        Targets, used only for ``model.score``.
    """
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    )
    # BUG FIX: the original sorted ascending and took the FIRST 10 rows,
    # which selects the 10 LEAST important features. Take the 10 largest,
    # then sort ascending so the biggest bar renders at the top of the
    # horizontal bar chart.
    top = imp.sort_values('Importance', ascending=False).head(10)
    top.sort_values('Importance').plot(kind='barh')
    print(model.score(X, y))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment