Created
June 6, 2018 14:34
-
-
Save debrouwere/13ac0c5a3f39aab8adac53a6bf447f59 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import seaborn as sns | |
from scipy.optimize import fmin | |
from scipy.stats import * | |
from scipy.stats.distributions import beta, binom | |
from snakify import snakify | |
# NOTE: for samples (as opposed to distributions), use pymc3.utils.hpd | |
def hpd(distribution, mass=0.95): | |
def width(lo): | |
return distribution.ppf(mass + lo) - distribution.ppf(lo) | |
# find the lower bound that minimizes interval width; | |
# use a symmetrical distribution with a/2 in each tail | |
# as the initial guess | |
symmetrical = (1.0 - mass) / 2 | |
lo = fmin(width, symmetrical, ftol=1e-6, disp=False)[0] | |
hi = mass + lo | |
return distribution.ppf([lo, hi]) | |
unemployment = pd.read_excel('vdab-schoolverlaters-2018.xlsx') | |
unemployment = unemployment.groupby(['studieniveau', 'studiegebied', 'studierichting']).sum().reset_index() | |
unemployment = unemployment[unemployment.studierichting.ne(0) & unemployment.studiegebied.ne('Geen studiegebied') & unemployment.studieniveau.ne('HBO5')].copy() | |
unemployment.rename(columns={ | |
'aantal_sv': 'n', | |
'aantal_wz': 'c', | |
}, inplace=True) | |
def fit(group): | |
try: | |
x = (group.c / group.n).clip(1e-3, 1 - 1e-3) | |
n = group.n.sum() | |
a, b, loc, scale = beta.fit(x, floc=0, fscale=1) | |
return pd.Series({'a': a, 'b': b, 'p': a / b, 'n': n}) | |
except RuntimeError: | |
return None | |
priors_by_level = unemployment.groupby('studieniveau').apply(fit).reset_index() | |
priors_by_area = unemployment.groupby(['studieniveau', 'studiegebied']).apply(fit).reset_index() | |
priors = pd.merge(priors_by_area, priors_by_level, on='studieniveau', suffixes=('_area', '_level')) | |
priors['a'] = priors.a_area.fillna(priors.a_level) | |
priors['b'] = priors.b_area.fillna(priors.b_level) | |
unemployment = pd.merge(unemployment, priors, on=['studieniveau', 'studiegebied'], how='left') | |
def interval(degree): | |
distribution = beta(degree.a + degree.c, degree.b + degree.n) | |
return pd.Series(hpd(distribution, mass=0.9), index=('lo', 'hi')) | |
unemployment['p'] = unemployment.c / unemployment.n | |
unemployment['p_adj'] = (unemployment.a + unemployment.c) / (unemployment.b + unemployment.n) | |
unemployment[['lo', 'hi']] = unemployment.apply(interval, axis=1) | |
unemployment['n_sqrt'] = np.sqrt(unemployment.n) | |
# inspect goodness of fit | |
def curve(a, b, **kwargs): | |
if not len(a) or not len(b): | |
return | |
a = a.iloc[0] | |
b = b.iloc[0] | |
x = np.arange(0.01, 1.01, 0.01) | |
y = beta(a, b).pdf(x) | |
plt.plot(x, y, color='red') | |
g = sns.FacetGrid(unemployment, col='studieniveau') | |
g = g.map(plt.hist, 'p', bins=20, density=True) | |
g = g.map(curve, 'a_level', 'b_level') | |
g.savefig('beta-fit-by-studieniveau.png') | |
g = sns.FacetGrid(unemployment, col='studieniveau', row='studiegebied') | |
g = g.map(plt.hist, 'p', bins=20, density=True) | |
g = g.map(curve, 'a_area', 'a_area') | |
g.savefig('beta-fit-by-studiegebied.png') | |
unemployment.sort_values('p_adj', inplace=True) | |
unemployment.to_csv('unemployment-credible.csv', index=False) | |
unemployment[[ | |
'studieniveau', | |
'studiegebied', | |
'studierichting', | |
'n', | |
'n_level', | |
'n_area', | |
'p', | |
'p_level', | |
'p_area', | |
'p_adj', | |
'lo', | |
'hi', | |
]].round(2) | |
sns.jointplot('p', 'aantal_sv_sqrt', data=unemployment[unemployment.p_adj.le(0.5)], xlim=(0, 0.6), ylim=(0, 15)) | |
sns.jointplot('p_adj', 'aantal_sv_sqrt', data=unemployment[unemployment.p_adj.le(0.5)], xlim=(0, 0.6), ylim=(0, 15)) | |
unemployment[unemployment.studiegebied.eq('Koeling en warmte') & unemployment.studieniveau.eq('BSO3')].round(2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment