Created
November 15, 2019 00:35
-
-
Save halloleo/575ab08f7474a44260b9d803249762a8 to your computer and use it in GitHub Desktop.
Sample a given population many times
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Sample a given population many times with a fixed confidence level | |
""" | |
from pathlib import Path | |
import math | |
import csv | |
import numpy as np | |
import pandas as pd | |
import scipy.stats as stats | |
CACHE_DIR= 'cache' | |
# | |
# Utility functions | |
# | |
def in_percent(v): | |
"""Print value v as a percentage""" | |
print(f"{v*100:.1f}%") | |
def calc_50percent_df(n): | |
# Make list `one_lst` of exactly n/2 random indices | |
take_out_lst = [ i for i in range(n) ] | |
one_lst = [] | |
for i in range(n//2): | |
k = len(take_out_lst) | |
r = np.random.randint(0,k) | |
one_lst.append(take_out_lst[r]) | |
del(take_out_lst[r]) | |
# Make list and Dataframe where for the indices from `one_lst` are set to 1. | |
popl = [0] *n | |
for i in one_lst: | |
popl[i]=1 | |
df = pd.DataFrame(popl, columns=['vote']) | |
return df | |
def create_50percent_df(n=100000, use_cache=True): | |
""" | |
Create a random population with exact 50% of the population | |
having some feature (e.g. said yes to a question). | |
:param n: population size | |
:return: DataFrame | |
""" | |
cache_path = Path(f'{CACHE_DIR}/pop_{n}.pkl') | |
if cache_path.exists() and use_cache: | |
return pd.read_pickle(cache_path) | |
else: | |
df = calc_50percent_df(n) | |
df.to_pickle(cache_path) | |
return df | |
def find_zstar(cl): | |
""" | |
Calculate zstar from the confidence level | |
via zscore in scipy stats package | |
""" | |
cdf = 0.5+cl/2 | |
zstar = stats.norm.ppf(cdf) | |
return zstar | |
def find_CI(sample, cl, colname='vote', f=lambda x: x==1): | |
""" | |
Find the confidence interval for a sample | |
:param sample: Dataframe with one sample column | |
:param colname: name of column to look into for "yes votes" | |
:param f: test function for "yes votes" | |
:param cl: confidence level | |
:return: Confidence interval as pandas Interval | |
""" | |
n = len(sample) | |
# Count "yes votes" | |
x = len(sample[f(sample[colname]) == True]) | |
p_hat = x/n | |
# Margin of Error | |
sigma_p_hat = math.sqrt(p_hat * (1 - p_hat) / n) | |
z_star = find_zstar(cl) # from confidence level via scipy | |
E = z_star * sigma_p_hat # Margin of Error | |
return pd.Interval(p_hat-E, p_hat+E, closed='both') | |
def create_many_CIs(popdf, num_of_samples=100, sample_n=25, cl=0.95): | |
""" | |
Create an array of CIs from random samples | |
:param popdf: the population data | |
:param num_of_samples: number of samples to generate | |
:param sample_n: sample size | |
:param cl: confidence level | |
:return: ndarray of CIs (pandas Intervals) | |
""" | |
res = np.array([]) | |
for i in range(num_of_samples): | |
df = popdf.sample(sample_n) | |
df = df.reset_index(drop=True) | |
CI = find_CI(df, cl) | |
res = np.append(res, CI) | |
return res | |
def test_many_CIs(popdf, p, num_of_samples=100, sample_n=25, cl=0.95): | |
CIs = create_many_CIs(popdf, num_of_samples, sample_n, cl) | |
num_of_containing_CIs = len([ CI for CI in CIs if p in CI ]) | |
print(f"num_of_samples = {num_of_samples}") | |
print(f"num_of_containing_CIs = {num_of_containing_CIs}") | |
print(f"ratio = {num_of_containing_CIs/num_of_samples}") | |
def write_growing_CI_numbers(popdf, p, max_num_of_samples=1000, sample_n=50, | |
cl=0.95, csv_stem='growing_CI_nums'): | |
print(f"Generate {max_num_of_samples} CIs...", end='') | |
CIs = create_many_CIs(popdf, max_num_of_samples, sample_n, cl) | |
print("done") | |
max_exp = round(math.log(max_num_of_samples, 10)) | |
csv_fname = f'{csv_stem}_cl={cl}_n={sample_n}_N={len(popdf)}_mexp={max_exp}.csv' | |
print(f"Write {csv_fname} ...", end='') | |
with open(csv_fname, 'w') as csv_f: | |
writer = csv.writer(csv_f) | |
writer.writerow(['num_of_samples', 'num_of_containing_CIs', 'ratio']) | |
cnt=0 | |
for num_of_samples in [pow(10, e) for e in range(1, max_exp)] + [ | |
max_num_of_samples]: | |
num_of_containing_CIs = len( | |
[CI for CI in CIs[:num_of_samples] if p in CI]) | |
ratio = num_of_containing_CIs/num_of_samples | |
writer.writerow([num_of_samples, num_of_containing_CIs, ratio]) | |
csv_f.flush() | |
cnt += 1 | |
print(f"done ({cnt} lines written)") | |
if __name__ == '__main__': | |
print("--- Population ---") | |
popdf = create_50percent_df() | |
print(popdf.describe()) | |
p = len(popdf[popdf.vote == 1]) / len(popdf) | |
print(f"p = {p}") | |
print("\n--- Growing samples ---") | |
write_growing_CI_numbers(popdf, p, max_num_of_samples=10000000, csv_stem='growing_CI_nums') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment