Skip to content

Instantly share code, notes, and snippets.

View eustin's full-sized avatar
🦏

Justin Evans eustin

🦏
View GitHub Profile
@eustin
eustin / leading_commas.sql
Created March 7, 2024 23:55
leading commas
-- Leading-comma style: every column after the first starts its line with the comma.
select
some_column
, another_column
, yet_another_column
from some_table
@eustin
eustin / stat_sig_random_19.py
Created May 15, 2023 22:58
stat_sig_random_19
# Score test on the two proportions (variant counts first, then control),
# with the alternative set to "larger".
actual_specialness_result = score_test_proportions_2indep(
    NUM_CONVERTING_VARIANT_USERS,
    NUM_VARIANT_USERS,
    NUM_CONVERTING_CONTROL_USERS,
    NUM_CONTROL_USERS,
    alternative="larger",
)
print(f"actual 'specialness' result: {actual_specialness_result.pvalue:.2%}")
#> actual 'specialness' result: 3.83%
@eustin
eustin / stat_sig_random_18.py
Created May 15, 2023 22:57
stat_sig_random_18
# Share of the simulated differences that are at least as large as the
# difference observed in the experiment.
num_at_least_observed = (sampled_diffs >= observed_diff_in_rates).sum()
sampling_specialness_result = num_at_least_observed / len(sampled_diffs)
print(f"sampled 'specialness' result: {sampling_specialness_result:.2%}")
#> sampled 'specialness' result: 3.41%
@eustin
eustin / stat_sig_random_17.py
Created May 15, 2023 22:56
stat_sig_random_17
# Histogram of the simulated differences, with the observed difference marked.
plot_hist(
    sampled_diffs,
    bins=50,
    observed_rate=observed_diff_in_rates,
    title="Our range of pure randomness",
)
@eustin
eustin / stat_sig_random_16.py
Created May 15, 2023 22:56
stat_sig_random_16
# Number of random reshuffles to run.
NUM_SIMULATIONS = 10_000

# One simulated difference in rates per reshuffle of all_users.
sampled_diffs = sample_diffs_in_rates(
    all_users,
    NUM_CONTROL_USERS,
    NUM_SIMULATIONS,
)
@eustin
eustin / stat_sig_random_15.py
Created May 15, 2023 22:55
stat_sig_random_15
@njit(parallel=True)
def sample_diffs_in_rates(all_users, num_control_users, num_simulations):
    """Simulate differences in rates under random assignment of users.

    Each simulation shuffles ``all_users``, treats the first
    ``num_control_users`` entries as the control group and every remaining
    entry as the variant group, and records ``variant_rate - control_rate``.

    Args:
        all_users: array of per-user outcomes; it is shuffled in place, so
            the caller's array order is modified.
        num_control_users: number of leading entries counted as control.
        num_simulations: number of shuffles to run.

    Returns:
        Array of length ``num_simulations`` with one difference per shuffle.
    """
    results = np.zeros(num_simulations)
    for i in prange(num_simulations):
        # numpy random shuffling appears to be slower when using numba
        # NOTE(review): with parallel=True every prange iteration shuffles the
        # same shared array in place; concurrent swaps from different threads
        # may interleave. Confirm this is acceptable, or shuffle a
        # per-iteration copy instead.
        random.shuffle(all_users)
        control_rate = all_users[:num_control_users].mean()
        # we assume the rest of the users are variant users
        variant_rate = all_users[num_control_users:].mean()
        results[i] = variant_rate - control_rate
    # Callers index and compare the result, so the array must be returned.
    return results
@eustin
eustin / stat_sig_random_14.py
Created May 15, 2023 22:55
stat_sig_random_14
# Experiment sizes: total users and converting users in each group.
NUM_CONTROL_USERS = 1_000_000
NUM_VARIANT_USERS = 1_000_000
NUM_CONVERTING_CONTROL_USERS = 26_000
NUM_CONVERTING_VARIANT_USERS = 26_400

# Control group as a 0/1 array: converters (1.0) first, everyone else (0.0) after.
control_users = np.concatenate((
    np.ones(NUM_CONVERTING_CONTROL_USERS),
    np.zeros(NUM_CONTROL_USERS - NUM_CONVERTING_CONTROL_USERS),
))
# The mean of a 0/1 array is the share of ones, i.e. the conversion rate.
control_conversion_rate = control_users.mean()
@eustin
eustin / stat_sig_random_13.py
Created May 15, 2023 22:53
stat_sig_random_13
OBSERVED_DIFF_IN_RATES = 0.167  # this is our experiment result

# Compare every simulated difference against the observed one and count
# how many are at least as large.
num_samples = len(simulated_diffs_in_rates)
num_diffs_gte_observed = (simulated_diffs_in_rates >= OBSERVED_DIFF_IN_RATES).sum()

print(f"{num_diffs_gte_observed:,} out of {num_samples:,} random samples show differences in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}")
print(f"percentage of random noise distribution with difference in rates greater than or equal to {OBSERVED_DIFF_IN_RATES:.1%}: {num_diffs_gte_observed / num_samples:.2%}")
#> 1,267 out of 10,000 random samples show differences in rates greater than or equal to 16.7%
#> percentage of random noise distribution with difference in rates greater than or equal to 16.7%: 12.67%
@eustin
eustin / stat_sig_random_12.py
Created May 15, 2023 22:52
stat_sig_random_12
# Histogram of the simulated differences, with the observed 0.167 marked.
plot_hist(
    simulated_diffs_in_rates,
    bins=15,
    observed_rate=0.167,
    title="Our range of pure randomness",
)
@eustin
eustin / stat_sig_random_11.py
Created May 15, 2023 22:51
stat_sig_random_11
def plot_hist(experiment_results: np.ndarray,
              bins=100,
              observed_rate: float = None,
              title: str = None) -> None:
    """Plot a histogram of results, optionally marking an observed value.

    Args:
        experiment_results: values passed to ``sns.histplot``.
        bins: bin specification forwarded to ``sns.histplot``.
        observed_rate: when not ``None``, a red vertical line is drawn at
            this x position and a legend labelling it is added.
        title: plot title; skipped when ``None`` or empty.
    """
    sns.histplot(experiment_results, bins=bins)
    # Compare against None (not truthiness) so an observed difference of
    # exactly 0.0 is still drawn.
    if observed_rate is not None:
        plt.axvline(observed_rate, color='r', label='Diff in rates observed in experiment')
        plt.legend(bbox_to_anchor=(0.5, -0.2), loc="lower center")
    if title:
        plt.title(title)