Last active
August 9, 2018 17:09
-
-
Save khakieconomics/423825e2b83c448d95b5f7198b532fdd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rstan); library(tidyverse) | |
# A utility function | |
extract_pars <- function(mle_fit, pars) { | |
unlist(lapply(pars, function(n) mle_fit$par[grepl(pattern = n, x = names(mle_fit$par))])) | |
} | |
# Simulate fake data ------------------------------------------------------ | |
N <- 1000 | |
P <- 10 | |
P2 <- 5 | |
X = matrix(rnorm(N*P, 0, .5), N, P) | |
Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2) | |
gamma <- rnorm(P + P2) | |
Z <- cbind(X, Z_raw) | |
alpha_1 <- rnorm(1) | |
alpha <- rnorm(1) | |
beta <- rnorm(P) | |
sigma_u <- 2 | |
# Draw the structural shocks | |
structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2)) | |
# Simulate reporting a salary (participation) and the wage | |
participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0) | |
wage <- participation * (alpha + X %*% beta + structural_shocks[,2]) | |
# Record the actual probabilities | |
actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma) | |
# Create the data list | |
data_list <- list(N = N, | |
P = P, | |
P2 = P2, | |
X = X, Z_raw = Z_raw, | |
Y = as.numeric(wage), | |
participation = participation, | |
estimate_model = 1) | |
# Comparing our Bayesian fit to Heckman correction --------------------------------------- | |
# Vanilla OLS | |
ols_fit <- lm(wage ~ ., data = as.data.frame(X) ,subset = participation==1) | |
plot(coef(ols_fit), c(alpha, beta)) | |
abline(0, 1) | |
# Heckman correction | |
first_stage <- glm(participation ~ ., data = as.data.frame(Z), family = binomial(link = "probit")) | |
yhat <- predict(first_stage, type = "response") | |
lambda <- dnorm(yhat)/pnorm(yhat) | |
second_stage <- lm(wage ~ . + lambda, data = as.data.frame(X), subset = participation == 1) | |
plot(coef(second_stage)[1:(P+1)], c(alpha, beta)) | |
abline(0, 1) | |
# Compiled and fit the model ---------------------------------------------- | |
compiled_model <- stan_model("heckman_correction.stan") | |
fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4) | |
fit_sim | |
# Compare to posterior means | |
plot(get_posterior_mean(fit_sim, pars = c("alpha", "beta"))[,5], c(alpha, beta)) | |
abline(0, 1) | |
# Now fit by penalized likelihood | |
fit_mle <- optimizing(compiled_model, data = data_list, init = 0) | |
plot(extract_pars(fit_mle, c("alpha", "beta"))[-2], c(alpha, beta)) | |
abline(0, 1) | |
plot(extract_pars(fit_mle, c("p\\[")),actual_p) | |
abline(0, 1, col = 2) | |
# Some rough plots of estimates vs generative actuals --------------------- | |
confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T) | |
confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T) | |
data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), | |
values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>% | |
unnest() %>% | |
mutate(estimates = confints$estimate, | |
lower = confints$conf.low, | |
upper = confints$conf.high) %>% | |
ggplot(aes(x = values, y = estimates, colour = parameter)) + | |
geom_point() + | |
geom_linerange(aes(ymin = lower, ymax = upper)) + | |
geom_abline(aes(intercept= 0, slope = 1)) | |
data_frame(parameter = "p", | |
values = list(actual_p)) %>% | |
unnest() %>% | |
mutate(estimates = confints_p$estimate, | |
lower = confints_p$conf.low, | |
upper = confints_p$conf.high, | |
misestimated = values < lower | values > upper) %>% | |
ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) + | |
geom_point() + | |
scale_color_manual(values = c("black", "red")) + | |
guides(alpha = F) + | |
geom_linerange(aes(ymin = lower, ymax = upper)) + | |
geom_abline(aes(intercept= 0, slope = 1)) | |
# What proportion of individual-level estimates fall within bounds? | |
# NB we probably need more iterations | |
data_frame(parameter = "p", | |
values = list(actual_p)) %>% | |
unnest() %>% | |
mutate(estimates = confints_p$estimate, | |
lower = confints_p$conf.low, | |
upper = confints_p$conf.high, | |
misestimated = values < lower | values > upper) %>% | |
summarise(mean(misestimated)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment