Created
June 27, 2020 21:53
-
-
Save ianozsvald/f373f4278a303bfd5879293831298c45 to your computer and use it in GitHub Desktop.
Take bootstrap samples of an array of items (e.g. strings) to calculate a confidence interval (CI) on the count of each item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Take bootstrap samples to calculate a CI on the counts of items.
def bootstrap_sample_on_array(items, quantiles=(0.025, 0.975), n_bootstrap_samples=1000):
    """Estimate quantiles of per-item counts by bootstrap resampling.

    Each bootstrap round draws ``len(items)`` indices uniformly at random
    with replacement, counts how often every distinct item occurs in that
    resample, and records the counts. The requested quantiles are then
    taken column-wise over all rounds.

    Args:
        items: One-dimensional array-like of items (e.g. strings). Lists and
            tuples are accepted and converted with ``np.asarray``.
        quantiles: Quantile(s) passed straight to ``DataFrame.quantile``.
            The default gives the bounds of a 95% interval.
        n_bootstrap_samples: Number of bootstrap rounds.

    Returns:
        The result of ``DataFrame.quantile(quantiles)`` on the table of
        per-round counts: for a list-like ``quantiles``, a DataFrame indexed
        by quantile with one column per distinct item that appeared in at
        least one resample.

    Raises:
        ValueError: If ``items`` is empty (there is nothing to resample).
    """
    items = np.asarray(items)
    n_items = items.shape[0]
    if n_items == 0:
        raise ValueError("items must contain at least one element")

    all_counts = []
    for _ in range(n_bootstrap_samples):
        # Resample with replacement, keeping the original sample size.
        sample_ids = np.random.randint(low=0, high=n_items, size=n_items)
        uniq, cnts = np.unique(items[sample_ids], return_counts=True)
        all_counts.append(dict(zip(uniq, cnts)))

    # An item missing from a given resample has a count of 0, not NaN.
    all_counts = pd.DataFrame(all_counts).fillna(0)
    return all_counts.quantile(quantiles)
# Example data: 100 'a', 50 'b' and 5 'c' entries in a single numpy array.
items = np.array(['a'] * 100 + ['b'] * 50 + ['c'] * 5)
bootstrap_sample_on_array(items)  # gives dataframe with 0.025 & 0.975 CI for each item
# Approximately equivalent to the following analytic calculation
# (normal approximation to the binomial proportion).
def calculate_ci_on_items(items):
    """Compute an analytic 95% interval on the count of each distinct item.

    For every distinct item the observed proportion ``p`` is computed, the
    half-width ``1.96 * sqrt(p * (1 - p) / n)`` is added and subtracted
    (``n`` being the total number of items), and the resulting proportion
    bounds are scaled back to counts by multiplying by ``n``.

    Args:
        items: Array-like of items (e.g. strings) accepted by ``np.unique``.

    Returns:
        DataFrame with rows ``'0.025'`` and ``'0.975'`` (string labels) and
        one column per distinct item, holding the lower and upper bounds
        expressed as counts.
    """
    uniq, cnts = np.unique(items, return_counts=True)
    counts = pd.Series(data=cnts, index=uniq, name='counts')
    total = counts.sum()

    prop = counts / total
    ci_95 = np.sqrt((prop * (1 - prop)) / total) * 1.96

    df_summary = counts.to_frame()
    df_summary['prop'] = prop
    df_summary['prop_0.025'] = prop - ci_95
    df_summary['prop_0.975'] = prop + ci_95
    # Convert the proportion bounds back into counts.
    df_summary['0.025'] = df_summary['prop_0.025'] * total
    df_summary['0.975'] = df_summary['prop_0.975'] * total
    return df_summary.T.loc[['0.025', '0.975']]
# Analytic CI on the same example `items`, for comparison with the bootstrap result.
calculate_ci_on_items(items)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment