Skip to content

Instantly share code, notes, and snippets.

@debrouwere
Created June 6, 2018 14:34
Show Gist options
  • Save debrouwere/13ac0c5a3f39aab8adac53a6bf447f59 to your computer and use it in GitHub Desktop.
Save debrouwere/13ac0c5a3f39aab8adac53a6bf447f59 to your computer and use it in GitHub Desktop.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.optimize import fmin
from scipy.stats import *
from scipy.stats.distributions import beta, binom
from snakify import snakify
# NOTE: for samples (as opposed to distributions), use pymc3.utils.hpd
def hpd(distribution, mass=0.95):
def width(lo):
return distribution.ppf(mass + lo) - distribution.ppf(lo)
# find the lower bound that minimizes interval width;
# use a symmetrical distribution with a/2 in each tail
# as the initial guess
symmetrical = (1.0 - mass) / 2
lo = fmin(width, symmetrical, ftol=1e-6, disp=False)[0]
hi = mass + lo
return distribution.ppf([lo, hi])
unemployment = pd.read_excel('vdab-schoolverlaters-2018.xlsx')
unemployment = unemployment.groupby(['studieniveau', 'studiegebied', 'studierichting']).sum().reset_index()
unemployment = unemployment[unemployment.studierichting.ne(0) & unemployment.studiegebied.ne('Geen studiegebied') & unemployment.studieniveau.ne('HBO5')].copy()
unemployment.rename(columns={
'aantal_sv': 'n',
'aantal_wz': 'c',
}, inplace=True)
def fit(group):
try:
x = (group.c / group.n).clip(1e-3, 1 - 1e-3)
n = group.n.sum()
a, b, loc, scale = beta.fit(x, floc=0, fscale=1)
return pd.Series({'a': a, 'b': b, 'p': a / b, 'n': n})
except RuntimeError:
return None
priors_by_level = unemployment.groupby('studieniveau').apply(fit).reset_index()
priors_by_area = unemployment.groupby(['studieniveau', 'studiegebied']).apply(fit).reset_index()
priors = pd.merge(priors_by_area, priors_by_level, on='studieniveau', suffixes=('_area', '_level'))
priors['a'] = priors.a_area.fillna(priors.a_level)
priors['b'] = priors.b_area.fillna(priors.b_level)
unemployment = pd.merge(unemployment, priors, on=['studieniveau', 'studiegebied'], how='left')
def interval(degree):
distribution = beta(degree.a + degree.c, degree.b + degree.n)
return pd.Series(hpd(distribution, mass=0.9), index=('lo', 'hi'))
unemployment['p'] = unemployment.c / unemployment.n
unemployment['p_adj'] = (unemployment.a + unemployment.c) / (unemployment.b + unemployment.n)
unemployment[['lo', 'hi']] = unemployment.apply(interval, axis=1)
unemployment['n_sqrt'] = np.sqrt(unemployment.n)
# inspect goodness of fit
def curve(a, b, **kwargs):
if not len(a) or not len(b):
return
a = a.iloc[0]
b = b.iloc[0]
x = np.arange(0.01, 1.01, 0.01)
y = beta(a, b).pdf(x)
plt.plot(x, y, color='red')
g = sns.FacetGrid(unemployment, col='studieniveau')
g = g.map(plt.hist, 'p', bins=20, density=True)
g = g.map(curve, 'a_level', 'b_level')
g.savefig('beta-fit-by-studieniveau.png')
g = sns.FacetGrid(unemployment, col='studieniveau', row='studiegebied')
g = g.map(plt.hist, 'p', bins=20, density=True)
g = g.map(curve, 'a_area', 'a_area')
g.savefig('beta-fit-by-studiegebied.png')
unemployment.sort_values('p_adj', inplace=True)
unemployment.to_csv('unemployment-credible.csv', index=False)
unemployment[[
'studieniveau',
'studiegebied',
'studierichting',
'n',
'n_level',
'n_area',
'p',
'p_level',
'p_area',
'p_adj',
'lo',
'hi',
]].round(2)
sns.jointplot('p', 'aantal_sv_sqrt', data=unemployment[unemployment.p_adj.le(0.5)], xlim=(0, 0.6), ylim=(0, 15))
sns.jointplot('p_adj', 'aantal_sv_sqrt', data=unemployment[unemployment.p_adj.le(0.5)], xlim=(0, 0.6), ylim=(0, 15))
unemployment[unemployment.studiegebied.eq('Koeling en warmte') & unemployment.studieniveau.eq('BSO3')].round(2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment