Created
May 21, 2020 05:09
-
-
Save evgenyneu/f4961bae1a48279b80be603b8b0649c0 to your computer and use it in GitHub Desktop.
Calculate probability distribution of black marbles in a bag, given observed proportion in the sample form the bag
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy import stats | |
import matplotlib.pyplot as plt | |
def find_posterior(M, N, p, p_hpdi): | |
""" | |
We have a bag containing black and white marbles, `M` in total. | |
We don't know how many black marbles are in the bag. | |
We take `N` from the bag and count the proportion of black marbles `p`. | |
Note after taking a marble, we keep it, we do not return it back | |
into the bag. | |
This function estimates probability distribution of the proportion | |
of black marbles in the bag. | |
Parameters | |
-------- | |
M: int | |
Total number of marbles. | |
N: int | |
Number of marbles drawn from the bag | |
(without replacement, i.e. we keep each marble and not return it | |
back into the bag). | |
p: float | |
Proportion of black marbles drawn from the bag | |
(i.e. observed success rate). | |
p_hpdi: float | |
The probability of the HPDI interval we want to calculate | |
from the distribution. | |
""" | |
# Number of black marbles that we drawn (i.e. number of successes) | |
k = round(p * N) | |
# Grid containing different possibilities of the | |
# number of black marbles in the bag: i.e. n = 0, 1, 2, ..., M | |
n_grid = np.arange(0, M + 1) | |
# Calculate likelihoods of observing the data | |
# for different number of black marbles in the bag: n = 0, 1, 2, ... M | |
likelihood = [ | |
stats.hypergeom.pmf(M=M, n=n, N=N, k=k) | |
for n in n_grid | |
] | |
# Normalise the likelihood to get probability distribution that sums to 1 | |
posterior = likelihood / np.sum(likelihood) | |
# Grid containing the proportion of successes (black marbles) | |
p_grid = n_grid / M | |
# Mode | |
mode = p_grid[np.argmax(posterior)] | |
mode_str = f'Mode = {mode:.3f}' | |
print(mode_str) | |
# Calculate HPDI | |
hpdi = find_hpdi(probability_density=posterior, p=p_hpdi) | |
p_inteval = (p_grid[hpdi[0]], p_grid[hpdi[1]]) | |
hpdi_label = f'{p_hpdi * 100:.1f}% HPDI' | |
interval_str = f'{hpdi_label} = ({p_inteval[0]:.3f}, {p_inteval[1]:.3f})' | |
print(interval_str) | |
# Plot the posterior distribution, its mode and HPDI | |
plt.plot(p_grid, posterior) | |
x_fill = p_grid[hpdi[0]: hpdi[1]] | |
y_fill = posterior[hpdi[0]: hpdi[1]] | |
plt.fill_between(x_fill, y_fill, color="#5533BB70", label=hpdi_label) | |
plt.axvline(x=mode, color='red', label="Mode") | |
plt.xlabel("Proportion of black marbles") | |
plt.ylabel("Probability density") | |
plt.legend() | |
plt.grid() | |
plt.title(( | |
f"Posterior probability distribution of black marbles\n" | |
f"{mode_str}, {interval_str}" | |
)) | |
plt.tight_layout() | |
plt.show() | |
def find_hpdi(probability_density, p=0.6827): | |
""" | |
Calculate HPDI (highest posterior density interval) that contains | |
`p` probability given probability density. HPDI is the narrowest | |
interval that contain the given `p` probability. | |
Parameters | |
---------- | |
probability_density: list of float | |
Probability density values, the sum needs to be 1. | |
p: float | |
Probability for the interval. Default value `0.6827` corresponds | |
to traditional 1-sigma probability interval. | |
Returns | |
------- | |
(low, highi): | |
Returns the lower and high indices of HPDI interval from | |
`probability_density` list. | |
""" | |
# Most narrow interval containing the probability p | |
narrow_interval = (0, len(probability_density) - 1) | |
previous_effective_width = len(probability_density) | |
for i_left in range(len(probability_density) - 1): | |
prob_sum = probability_density[i_left] | |
for i_right in range(i_left + 1, len(probability_density)): | |
prob_sum += probability_density[i_right] | |
if (prob_sum > p): | |
current_width = i_right - i_left | |
effective_width = p * current_width / prob_sum | |
if effective_width < previous_effective_width: | |
previous_effective_width = effective_width | |
narrow_interval = (i_left, i_right) | |
break | |
return narrow_interval | |
if __name__ == "__main__": | |
find_posterior(M=250, N=75, p=0.93, p_hpdi=0.6827) | |
print("We are done") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment