Created
March 1, 2024 03:15
-
-
Save notionparallax/be5161be654a2558e06f79639b2e4cf7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% | |
import textwrap | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import pandas as pd | |
import numpy as np | |
# %% | |
def random_from_distribution(a, b, number_of_sigma=3, as_int=True):
    """Draw one sample from a normal distribution centred between a and b.

    The mean is the midpoint of ``a`` and ``b``. The standard deviation is
    chosen so that each endpoint lies ``number_of_sigma`` standard deviations
    from the mean. Samples are not clipped, so values outside the ``a``..``b``
    range can be returned.

    Args:
        a: One end of the nominal range.
        b: The other end of the nominal range. When ``a == b`` the standard
            deviation is zero and the midpoint itself is returned.
        number_of_sigma: Number of standard deviations between the midpoint
            and either endpoint. Zero causes a division by zero.
        as_int: When true, return the sample rounded to the nearest integer.

    Returns:
        An ``int`` when ``as_int`` is true, otherwise the raw sample.
    """
    mu = (a + b) / 2
    sigma = abs(mu - a) / number_of_sigma
    sample = np.random.normal(mu, sigma)
    if as_int:
        # Round to nearest rather than truncating: a bare int() chops toward
        # zero, which pulls every positive sample down by ~0.5 on average.
        return int(round(sample))
    return sample
# %% Balancing strategies | |
# Per-rank chart abbreviation and salary range. "s_low" / "s_high" are the two
# range ends handed to random_from_distribution when salaries are sampled.
# The bands are the same for every strategy, so they are defined once here.
# NOTE(review): "Prinicipal" is misspelled, but it is a runtime key and a plot
# label, so it is left untouched here.
# NOTE(review): the "Prinicipal" band has s_low == s_high, so every sampled
# salary for that rank is exactly 80 - confirm this is intended.
# NOTE(review): salary units are presumably thousands of dollars - confirm.
_SALARY_BANDS = {
    "Prinicipal": {"abbr": "P", "s_low": 80, "s_high": 80},
    "Senior Practice Director": {"abbr": "SPD", "s_low": 150, "s_high": 250},
    "Practice Director": {"abbr": "PD", "s_low": 130, "s_high": 190},
    "Senior Associate": {"abbr": "SA", "s_low": 120, "s_high": 140},
    "Interior Designer": {"abbr": "ID", "s_low": 80, "s_high": 120},
    "Architect": {"abbr": "Arch", "s_low": 85, "s_high": 120},
    "Architecture Grad": {"abbr": "Grad", "s_low": 75, "s_high": 90},
    "Student": {"abbr": "Student", "s_low": 65, "s_high": 80},
    "Other Streams": {"abbr": "Other", "s_low": 70, "s_high": 170},
    "BIM Manager": {"abbr": "BM", "s_low": 110, "s_high": 150},
    "Technical": {"abbr": "T", "s_low": 100, "s_high": 200},
}

# Headcount per rank as (male, female).
_REAL_HEADCOUNTS = {
    "Prinicipal": (11, 6),
    "Senior Practice Director": (16, 7),
    "Practice Director": (16, 13),
    "Senior Associate": (21, 22),
    "Interior Designer": (0, 14),
    "Architect": (23, 16),
    "Architecture Grad": (22, 17),
    "Student": (2, 9),
    "Other Streams": (21, 33),
    "BIM Manager": (5, 1),
    "Technical": (8, 4),
}

# Same ranks with an equal number of men and women in each.
_BALANCED_HEADCOUNTS = {
    "Prinicipal": (8, 8),
    "Senior Practice Director": (11, 11),
    "Practice Director": (15, 15),
    "Senior Associate": (21, 21),
    "Interior Designer": (7, 7),
    "Architect": (20, 20),
    "Architecture Grad": (19, 19),
    "Student": (5, 5),
    "Other Streams": (26, 26),
    "BIM Manager": (3, 3),
    "Technical": (6, 6),
}


def _build_strategy(headcounts):
    """Combine (male, female) headcounts with the shared salary bands."""
    return {
        rank: {"m": male, "f": female, **_SALARY_BANDS[rank]}
        for rank, (male, female) in headcounts.items()
    }


strategies = {
    "real_BVN_numbers": _build_strategy(_REAL_HEADCOUNTS),
    "balanced_numbers": _build_strategy(_BALANCED_HEADCOUNTS),
    # Real headcounts, except the Student rank becomes 16 men and 0 women.
    "male_students": _build_strategy({**_REAL_HEADCOUNTS, "Student": (16, 0)}),
    # Real headcounts, except Senior Practice Director becomes 10 men, 13 women.
    "fire_the_boys": _build_strategy(
        {**_REAL_HEADCOUNTS, "Senior Practice Director": (10, 13)}
    ),
}
# The strategy used by the rest of the script.
people_meta = strategies["balanced_numbers"]
# %% | |
def make_people_data(meta=None):
    """Expand per-rank headcounts into one row per simulated person.

    Args:
        meta: Mapping of rank name to a dict with the keys ``"m"`` and ``"f"``
            (headcounts), ``"abbr"``, ``"s_low"`` and ``"s_high"``. When
            omitted, the module-level ``people_meta`` is used, looked up at
            call time.

    Returns:
        A DataFrame with the columns ``rank``, ``sex``, ``salary`` and
        ``abbr``. Each salary is an independent draw from
        ``random_from_distribution(s_low, s_high)`` for that person's rank.
    """
    if meta is None:
        meta = people_meta
    people = []
    for rank, data in meta.items():
        for sex in ("m", "f"):
            for _ in range(data[sex]):
                people.append(
                    {
                        "rank": rank,
                        "sex": sex,
                        "salary": random_from_distribution(
                            data["s_low"], data["s_high"]
                        ),
                        "abbr": data["abbr"],
                    }
                )
    # Naming the columns keeps the frame well-formed even with zero people.
    return pd.DataFrame(people, columns=["rank", "sex", "salary", "abbr"])
people_df = make_people_data()
# %%
# Mean and median salary for each sex within each rank.
# The aggregations are named by string: handing np.mean / np.median to
# groupby.agg triggers a pandas FutureWarning about how such callables will
# be applied in future versions.
mean_salary = people_df.groupby(["rank", "sex"]).agg(
    Mean=("salary", "mean"), Median=("salary", "median")
)
# Move the "sex" index level into the columns, giving (statistic, sex) pairs.
summary_salary_df = mean_salary.unstack()
# Per rank: how far the male figure sits above the female one, as a
# percentage of the female figure.
summary_salary_df["pc_diff_mean"] = (
    (summary_salary_df[("Mean", "m")] - summary_salary_df[("Mean", "f")])
    / summary_salary_df[("Mean", "f")]
) * 100
summary_salary_df["pc_diff_median"] = (
    (summary_salary_df[("Median", "m")] - summary_salary_df[("Median", "f")])
    / summary_salary_df[("Median", "f")]
) * 100
# %%
summary_salary_df
# %% | |
def mean_and_median_delta(df):
    """Return the overall male-vs-female salary gap as percentages.

    Args:
        df: DataFrame with a ``sex`` column containing both ``"m"`` and
            ``"f"`` and a numeric ``salary`` column.

    Returns:
        A ``(pc_delta_mean, pc_delta_median)`` tuple. Each value is
        ``(male - female) / female * 100`` for that statistic, so a positive
        number means the male figure is higher.

    Raises:
        KeyError: If either ``"m"`` or ``"f"`` is absent from ``sex``.
    """
    # One grouped pass yields both statistics; rows are sexes, columns stats.
    by_sex = df.groupby("sex")["salary"].agg(["mean", "median"])
    male = by_sex.loc["m"]
    female = by_sex.loc["f"]
    pc_delta = ((male - female) / female) * 100
    return pc_delta["mean"], pc_delta["median"]
pc_delta_mean, pc_delta_median = mean_and_median_delta(people_df)
# Headline figures for the cohort, rounded to two decimal places. The leading
# empty entry reproduces the blank line printed ahead of the two sentences.
summary_lines = (
    "",
    f"The mean salary gap for this cohort is {round(pc_delta_mean, 2)}%.",
    f"The median salary gap for this cohort is {round(pc_delta_median, 2)}%.",
)
print("\n".join(summary_lines))
# %% | |
def draw_v_plot(mean_or_median="mean"):
    """Draw a split violin plot of salary by rank and sex, annotated per rank.

    Reads the module-level ``people_df``, ``summary_salary_df``,
    ``pc_delta_mean`` and ``pc_delta_median``.

    Args:
        mean_or_median: ``"mean"`` or ``"median"``. Selects which per-rank
            summary column supplies the male/female figures in the
            annotations, and which colour palette is used. Any other value
            prints a message and returns without drawing anything.
    """
    if mean_or_median == "mean":
        summary_column = "Mean"
        palette = "summer"
    elif mean_or_median == "median":
        summary_column = "Median"
        palette = "spring"
    else:
        print("we're only supporting [mean|median] at the moment, use one of those two")
        return

    # The theme has to be set before the axes exist: style settings are read
    # when axes are created, so setting it afterwards leaves the first figure
    # without the whitegrid look.
    sns.set_theme(style="whitegrid")
    _fig, ax = plt.subplots(figsize=(16, 9))
    sns.violinplot(
        ax=ax,
        data=people_df,
        x="rank",
        y="salary",
        hue="sex",
        palette=palette,
        split=True,
    )
    ax.set(
        xlabel="",
        ylabel="Salary ($)",
        title=(
            "Salary distribution violin plot, grouped by title and sex\n"
            "Overall gender pay gap for this group is "
            f"{round(pc_delta_mean, 2)}% (mean) and {round(pc_delta_median, 2)}% (median).\n"
            f"Using {summary_column} as summary method"
        ),
        ylim=(0, 275),
    )

    wrap_point = 12
    # Tick label text in axis order; a rank's position in this list is the x
    # coordinate of its violin. Read once rather than once per rank.
    tick_texts = [tick_label.get_text() for tick_label in ax.get_xticklabels()]
    for name, d in summary_salary_df.iterrows():
        mx = round(d[summary_column]["m"], 1)
        fx = round(d[summary_column]["f"], 1)
        # Each pc_diff_* entry is a one-element Series; .iloc[0] takes it by
        # position without relying on deprecated integer-key fallback.
        delta_pc_mean = round(d["pc_diff_mean"].iloc[0], 1)
        delta_pc_median = round(d["pc_diff_median"].iloc[0], 1)
        rank_name = textwrap.fill(name, wrap_point)
        annotation = "\n".join(
            (
                rank_name,
                f"mX̄: ${mx}",
                f"fX̄: ${fx}",
                f"X̄∆: {delta_pc_mean}%",
                f"x͂∆: {delta_pc_median}%",
            )
        )
        ax.text(
            tick_texts.index(name),
            5,
            annotation,
            fontsize=10,
            ha="left",
        )

    # Wrap long rank names so neighbouring tick labels do not collide.
    ax.set_xticklabels(
        [textwrap.fill(t.get_text(), wrap_point) for t in ax.get_xticklabels()]
    )
    sns.despine()
    plt.tight_layout()
    plt.show()
# Violin plot annotated with the per-rank means.
draw_v_plot("mean")
# %%
# Same plot annotated with the per-rank medians.
draw_v_plot("median")
# %% | |
# TODO: filter out rank|sex that have fewer than N members. E.g. don't show the salary of the one female bim manager
# TODO: Move the summary text over to the left. I'm not sure if it can be aligned on the colon
# TODO: align the colours to the brand colours
# ✔ TODO: show overall sex pay gap
# TODO: show mean and median on the same hist
# %% | |
def gap_distribution(runs=10000):
    """Simulate the company's salaries `runs` times.

    Each run builds a fresh cohort with make_people_data and measures its
    pay gap with mean_and_median_delta.

    Returns a pair of lists, each `runs` long: the mean pay gap percentages
    and the median pay gap percentages, in run order.
    """
    deltas = [mean_and_median_delta(make_people_data()) for _ in range(runs)]
    mean_gaps = [mean_gap for mean_gap, _ in deltas]
    median_gaps = [median_gap for _, median_gap in deltas]
    return mean_gaps, median_gaps
def draw_histogram(gap_list_mean, gap_list_median, sim_run_count=10000):
    """Plot the simulated mean and median pay gaps as overlaid histograms.

    Args:
        gap_list_mean: Mean pay gap percentages, one per simulation run.
        gap_list_median: Median pay gap percentages, one per simulation run.
        sim_run_count: Number of runs quoted in the title. It is only used
            for the title text and is not checked against the list lengths.
    """
    _fig, ax = plt.subplots(figsize=(16, 9))
    # Both series share the axes as outlines, so each needs a label for the
    # legend to tell them apart.
    ax.hist(gap_list_mean, bins=100, histtype="step", label="Mean pay gap")
    ax.hist(gap_list_median, bins=100, histtype="step", label="Median pay gap")
    # pd.Series(gap_list_mean).plot(kind='density')
    # pd.Series(gap_list_median).plot(kind='density')
    ax.set(
        xlabel="% pay gap, +ve favours men",
        ylabel="Number of sim runs with this %",
        title=f"Distribution of gender pay gap over {sim_run_count} simulations",
    )
    ax.legend()
# %% | |
gap_list_mean, gap_list_median = gap_distribution(runs=10000)
# %%
# Take the run count for the title from the results themselves so it cannot
# drift from the number of simulations actually performed.
draw_histogram(gap_list_mean, gap_list_median, sim_run_count=len(gap_list_mean))
# %% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment