Skip to content

Instantly share code, notes, and snippets.

@cjbayesian
Created November 15, 2018 15:57
Show Gist options
  • Save cjbayesian/05f35c78ffa37b4bebd6da8e96f4248e to your computer and use it in GitHub Desktop.
Save cjbayesian/05f35c78ffa37b4bebd6da8e96f4248e to your computer and use it in GitHub Desktop.
Plot cumulative distributions of multiple groups for comparison
def cdf_diff(df, var, grp='label', col=None, rm_outlier=None, hard_lim=None, ax=None, xlim=None):
    '''Plot cumulative distributions of multiple groups for comparison.

    If `var` takes exactly two distinct values, the per-group mean of `var`
    is drawn as a bar chart. Otherwise one empirical-CDF step curve is drawn
    per distinct value of `grp`.

    Relies on the module-level names `np` and `plt` (presumably numpy and
    matplotlib.pyplot -- confirm against the module's imports).

    Arguments:
        df: DataFrame
        var: string, name of column to be plotted
        grp: string, grouping variable
        col: list, colors to use for each group. Defaults to
            ['green', 'red']. A group label that converts to a valid integer
            index (e.g. 0/1) selects that entry; any other label cycles
            through the list in order of appearance.
        rm_outlier: None|float, remove datapoints beyond this many sigma
            (computed per group).
        hard_lim: None|pair, keep only datapoints strictly between
            hard_lim[0] and hard_lim[1].
        ax: axis on which to plot. Default None creates a new figure.
        xlim: optional x-axis limits, applied to the CDF plot only.

    Returns:
        The axis that was drawn on.

    Examples:
        cdf_diff(feats_labeled,var='Creatinine' ,rm_outlier=4.0)
        fig, ax = plt.subplots(1, 2)
        psLearn.cdf_diff(feats_labeled,var='Creatinine' ,ax=ax[0],rm_outlier=4.0)
        psLearn.cdf_diff(feats_labeled,var='Sodium Level',ax=ax[1])
    '''
    if col is None:
        col = ['green', 'red']
    if ax is None:
        _, ax = plt.subplots(1, 1)

    # A two-valued variable has no interesting CDF; show group means instead.
    if len(df[var].unique()) == 2:
        df.groupby(grp)[var].mean().plot(ax=ax, kind='bar', color=col)
        ax.set_title(var)
        return ax

    # Imported lazily: only the ECDF branch needs statsmodels.
    import statsmodels.api as sm

    for position, g in enumerate(df[grp].unique()):
        sample = df[df[grp] == g][var]
        sample = sample[np.isfinite(sample.values)]
        if rm_outlier is not None:
            sigma = sample.std()
            mu = sample.mean()
            sample = sample[(sample > mu - rm_outlier * sigma)
                            & (sample < mu + rm_outlier * sigma)]
        if hard_lim is not None:
            sample = sample[(sample > hard_lim[0]) & (sample < hard_lim[1])]
        if sample.empty:
            # Nothing left to plot for this group (e.g. a NaN label or
            # over-aggressive filtering); skip it instead of crashing.
            continue

        ecdf = sm.distributions.ECDF(sample)
        # Restrict the plotted x-range to observations whose ECDF value is
        # below 0.99 (presumably so the upper tail does not stretch the axis).
        trimmed = sample[ecdf(sample) < 0.99]
        if not trimmed.empty:
            # Tiny or constant groups lose every point to the trim; keep the
            # untrimmed sample for those.
            sample = trimmed
        x = np.linspace(sample.min(), sample.max(), 1000)
        y = ecdf(x)

        # Integer-like labels index `col` directly, as before; other labels
        # fall back to a positional colour so they no longer raise.
        try:
            colour = col[int(g)]
        except (TypeError, ValueError, IndexError):
            colour = col[position % len(col)]
        ax.step(x, y, label='%s = %s' % (grp, str(g)), c=colour)

    ax.set_title(var)
    ax.set_ylim([0, 1])
    if xlim:
        ax.set_xlim(xlim)
    return ax
@cjbayesian
Copy link
Author

cdf_diff just does a single plot. In order to build a multi-panel array you can do something like:

fig, axx = plt.subplots(5, 5, figsize=(20, 20))

# enumerate() supplies the running panel index; iterating the bare list would
# try to unpack each column name into (i, var) and fail.
for i, var in enumerate(['var1', 'var2', 'var3', ...]):
    # Fill the 5x5 grid row by row.
    ax = axx[i // 5, i % 5]
    cdf_diff(df, var, ax=ax)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment