Skip to content

Instantly share code, notes, and snippets.

View eustin's full-sized avatar
🐗

Justin Evans eustin

🐗
View GitHub Profile
@eustin
eustin / stat_sig_random_8.py
Created May 15, 2023 22:47
stat_sig_random_8
# Carve the user array into two simulated groups: the first NUM_CONTROL_USERS
# entries act as the control group, everything after them as the variant group.
simulated_control_group, simulated_variant_group = (
    all_users[:NUM_CONTROL_USERS],
    all_users[NUM_CONTROL_USERS:],
)
# Bare expression: has no effect in a script; presumably echoes both arrays
# when run as a notebook/REPL cell (see the recorded output below).
simulated_control_group, simulated_variant_group
#> (array([0., 1., 1., 0., 1., 1.]), array([1., 0., 0., 0., 1., 1.]))

# Simulated lift: variant conversion rate minus control conversion rate.
simulated_diff_in_rates = simulated_variant_group.mean() - simulated_control_group.mean()

print(f"simulated control conversion rate: {simulated_control_group.mean():.1%}")
print(f"simulated variant conversion rate: {simulated_variant_group.mean():.1%}")
#> simulated control conversion rate: 66.7%
#> simulated variant conversion rate: 50.0%
print(f"simulated difference in conversion rates: {simulated_diff_in_rates:.1%}")
@eustin
eustin / stat_sig_random_9.py
Created May 15, 2023 22:49
stat_sig_random_9
# Number of random re-assignments of users to groups to simulate.
NUM_SIMULATIONS = 10_000

# One slot per simulation, filled in by the loop below (float64, the same dtype
# the original list-of-floats -> np.array conversion produced).
simulated_diffs_in_rates = np.empty(NUM_SIMULATIONS)
for sim_idx in range(NUM_SIMULATIONS):
    # Shuffle in place so that which users land in which group is random;
    # note that this reorders all_users itself.
    rng.shuffle(all_users)
    # First NUM_CONTROL_USERS users are "control", the remainder "variant".
    control_conversion_rate = all_users[:NUM_CONTROL_USERS].mean()
    variant_conversion_rate = all_users[NUM_CONTROL_USERS:].mean()
    simulated_diffs_in_rates[sim_idx] = variant_conversion_rate - control_conversion_rate
@eustin
eustin / stat_sig_random_10.py
Created May 15, 2023 22:51
stat_sig_random_10
# Print the first ten simulated differences in conversion rates, numbered
# from 1 and formatted as percentages with one decimal place.
for i, rate in enumerate(simulated_diffs_in_rates[:10]):
    print(f"simulated diff in rates {i+1}: {rate:.1%}")
#> simulated diff in rates 1: -50.0%
#> simulated diff in rates 2: -16.7%
#> simulated diff in rates 3: 16.7%
#> simulated diff in rates 4: 16.7%
#> simulated diff in rates 5: 50.0%
#> simulated diff in rates 6: 50.0%
#> simulated diff in rates 7: -16.7%
# (the loop prints up to ten lines; only the first seven were recorded here)
@eustin
eustin / stat_sig_random_11.py
Created May 15, 2023 22:51
stat_sig_random_11
def plot_hist(experiment_results: np.ndarray,
              bins=100,
              observed_rate: float = None,
              title: str = None) -> None:
    """Draw a histogram of results, optionally marking an observed value.

    Args:
        experiment_results: Values to plot; passed straight to ``sns.histplot``.
        bins: Forwarded to ``sns.histplot`` as its ``bins`` argument.
        observed_rate: When not ``None``, a red vertical line is drawn at this
            x position together with a legend describing it. ``0.0`` is a
            valid value and is drawn like any other.
        title: When non-empty, set as the plot title.
    """
    sns.histplot(experiment_results, bins=bins)
    # Test against None instead of truthiness: an observed difference of
    # exactly 0.0 is falsy but must still be marked on the plot.
    if observed_rate is not None:
        plt.axvline(observed_rate, color='r', label='Diff in rates observed in experiment')
        # The legend only has something to show when the labelled line exists;
        # it is anchored at (0.5, -0.2) so it sits under the plot area.
        plt.legend(bbox_to_anchor=(0.5, -0.2), loc="lower center")
    if title:
        plt.title(title)
@eustin
eustin / stat_sig_random_12.py
Created May 15, 2023 22:52
stat_sig_random_12
# Histogram (15 bins) of the simulated differences in rates, passing 0.167 as
# the observed rate for plot_hist to mark.
plot_hist(
    simulated_diffs_in_rates,
    bins=15,
    observed_rate=0.167,
    title="Our range of pure randomness",
)
@eustin
eustin / stat_sig_random_13.py
Created May 15, 2023 22:53
stat_sig_random_13
# The experiment's observed difference (variant rate minus control rate).
# The reported "16.7%" is a rounded 1/6 -- assuming the six-user groups shown
# in the earlier snippets, every attainable difference is a multiple of 1/6
# (TODO confirm group sizes). Store the exact value: the rounded literal 0.167
# lies slightly ABOVE 1/6, so ">= 0.167" silently dropped every simulation
# that exactly tied the observed result.
OBSERVED_DIFF_IN_RATES = 1 / 6  # this is our experiment result

# A difference of two float means carries round-off (4/6 - 3/6 is not
# bit-for-bit equal to 1/6), so allow a tiny tolerance for the "equal" case.
num_diffs_gte_observed = (simulated_diffs_in_rates >= OBSERVED_DIFF_IN_RATES - 1e-9).sum()
num_samples = simulated_diffs_in_rates.shape[0]
print(f"{num_diffs_gte_observed:,} out of {num_samples:,} random samples show differences in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}")
print(f"percentage of random noise distribution with difference in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}: {num_diffs_gte_observed / num_samples:.2%}")
# NOTE: the output recorded below was produced with the old threshold of 0.167,
# which excluded simulations equal to the observed 16.7% and so effectively
# counted only differences of 50% or more. With ties included the count will be
# substantially higher (roughly half of the samples if all_users holds the
# 12 users / 7 conversions shown earlier -- verify by re-running).
#> 1,267 out of 10,000 random samples show differences in rates greater than or equal to 16.7%
#> percentage of random noise distribution with difference in rates greater than or equal to 16.7%: 12.67%
@eustin
eustin / stat_sig_random_14.py
Created May 15, 2023 22:55
stat_sig_random_14
# Experiment totals: size of each group and how many of its users converted.
NUM_CONTROL_USERS, NUM_CONVERTING_CONTROL_USERS = 1_000_000, 26_000
NUM_VARIANT_USERS, NUM_CONVERTING_VARIANT_USERS = 1_000_000, 26_400

# create our arrays of users
# Control group as a float array of 1.0 (converted) followed by 0.0 (did not).
control_users = np.concatenate((
    np.ones(NUM_CONVERTING_CONTROL_USERS),
    np.zeros(NUM_CONTROL_USERS - NUM_CONVERTING_CONTROL_USERS),
))
# Mean of a 0/1 array is the fraction of converting users.
control_conversion_rate = control_users.mean()
@eustin
eustin / stat_sig_random_15.py
Created May 15, 2023 22:55
stat_sig_random_15
@njit(parallel=True)
def sample_diffs_in_rates(all_users, num_control_users, num_simulations):
    """Simulate differences in conversion rates under random group assignment.

    Every simulation shuffles the users, treats the first
    ``num_control_users`` of them as the control group and the remainder as
    the variant group, and records variant rate minus control rate.

    Args:
        all_users: 1-D array of per-user outcomes; it is copied, never
            modified. (Presumably 0.0/1.0 conversion flags -- confirm with
            the caller.)
        num_control_users: Number of shuffled users assigned to control.
        num_simulations: Number of simulations to run.

    Returns:
        Array of length ``num_simulations`` holding one difference in rates
        per simulation.
    """
    results = np.zeros(num_simulations)
    for i in prange(num_simulations):
        # prange iterations may execute concurrently, so each one works on
        # its own copy: shuffling the shared input in place would be a data
        # race between iterations (and would reorder the caller's array).
        shuffled_users = all_users.copy()
        # numpy random shuffling appears to be slower when using numba
        random.shuffle(shuffled_users)
        control_rate = shuffled_users[:num_control_users].mean()
        # we assume the rest of the users are variant users
        variant_rate = shuffled_users[num_control_users:].mean()
        results[i] = variant_rate - control_rate
    # Hand the simulated differences back; the caller assigns this result.
    return results
@eustin
eustin / stat_sig_random_16.py
Created May 15, 2023 22:56
stat_sig_random_16
# How many simulations to request from sample_diffs_in_rates.
NUM_SIMULATIONS = 10_000
# Run the simulations over all users, with the first NUM_CONTROL_USERS of each
# shuffle treated as the control group.
sampled_diffs = sample_diffs_in_rates(
    all_users,
    NUM_CONTROL_USERS,
    NUM_SIMULATIONS,
)