Created
November 15, 2018 15:57
-
-
Save cjbayesian/05f35c78ffa37b4bebd6da8e96f4248e to your computer and use it in GitHub Desktop.
Plot cumulative distributions of multiple groups for comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _ecdf_values(sorted_sample, points):
    '''Evaluate the empirical CDF of a sample at the given points.

    Arguments:
        sorted_sample: 1-D array of observations, sorted ascending, non-empty
        points: array-like of locations at which to evaluate the ECDF
    Returns:
        array with, for each point, the fraction of observations <= that point
    '''
    counts = np.searchsorted(sorted_sample, points, side='right')
    return counts / float(len(sorted_sample))


def _group_colors(grps, col):
    '''Pick one color per group, in the same order as `grps`.

    When every group label can be used as an integer index into `col`
    (e.g. labels 0 and 1) the label selects the color directly. Otherwise
    (string labels, labels beyond the end of `col`, ...) colors are assigned
    by position in `grps`, cycling through `col` if there are more groups
    than colors.
    '''
    try:
        return [col[int(g)] for g in grps]
    except (TypeError, ValueError, IndexError):
        return [col[i % len(col)] for i in range(len(grps))]


def cdf_diff(df, var, grp='label', col=None, rm_outlier=None, hard_lim=None, ax=None, xlim=None):
    '''Plot cumulative distributions of multiple groups for comparison.

    If `var` takes exactly two distinct values, a bar chart of the per-group
    mean is drawn instead of the cumulative distributions.

    Arguments:
        df: DataFrame
        var: string, name of column to be plotted
        grp: string, grouping variable
        col: list, colors to use for each group. Defaults to ['green', 'red'].
             Integer-like group labels index into this list; any other labels
             get colors in order of first appearance.
        rm_outlier: None|float, remove datapoints beyond this many sigma
                    (computed per group).
        hard_lim: None|(low, high), keep only datapoints strictly between
                  low and high.
        ax: axis on which to plot. Default None creates a new figure.
        xlim: None|(left, right), x-axis limits for the distribution plot.
    Returns:
        The axis that was drawn on.
    Examples:
        cdf_diff(feats_labeled,var='Creatinine' ,rm_outlier=4.0)
        fig, ax = plt.subplots(1, 2)
        psLearn.cdf_diff(feats_labeled,var='Creatinine' ,ax=ax[0],rm_outlier=4.0)
        psLearn.cdf_diff(feats_labeled,var='Sodium Level',ax=ax[1])
    '''
    if col is None:
        col = ['green', 'red']
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    grps = df[grp].unique()
    if len(df[var].unique()) == 2:
        # Binary variable: a CDF is uninformative, show the group means.
        df.groupby(grp)[var].mean().plot(ax=ax, kind='bar', color=col)
        ax.set_title(var)
    else:
        for g, color in zip(grps, _group_colors(grps, col)):
            sample = df[df[grp] == g][var]
            sample = sample[np.isfinite(sample.values)]
            if rm_outlier is not None:
                sigma = sample.std()
                mu = sample.mean()
                lower = mu - rm_outlier * sigma
                upper = mu + rm_outlier * sigma
                sample = sample[(sample > lower) & (sample < upper)]
            if hard_lim is not None:
                sample = sample[(sample > hard_lim[0]) & (sample < hard_lim[1])]
            if sample.empty:
                # Nothing left to plot for this group (all NaN or filtered out).
                continue
            values = np.sort(sample.values)
            # Restrict the plotted x-range to the lowest 99% of the data so a
            # long upper tail does not squash the rest of the curve.
            trimmed = values[_ecdf_values(values, values) < 0.99]
            if trimmed.size == 0:
                # Single-valued groups lose every point to the trim above;
                # fall back to the full range instead of failing.
                trimmed = values
            x = np.linspace(trimmed[0], trimmed[-1], 1000)
            y = _ecdf_values(values, x)
            ax.step(x, y, label='%s = %s' % (grp, str(g)), c=color)
        ax.set_title(var)
        ax.set_ylim([0, 1])
        if xlim is not None:
            ax.set_xlim(xlim)
    return ax
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
`cdf_diff` draws a single plot. To build a multi-panel array, you can do something like: