khakieconomics · August 9, 2018 17:09
diff --git a/run_heckman_correction.R b/run_heckman_correction.R
 library(rstan); library(tidyverse)

 # A utility function
 extract_pars <- function(mle_fit, pars) {
  unlist(lapply(pars, function(n) mle_fit$par[grepl(pattern = n, x = names(mle_fit$par))]))
 }

 # Simulate fake data ------------------------------------------------------


 N <- 1000
 P <- 10
 P2 <- 5
 X = matrix(rnorm(N*P, 0, .5), N, P)
 Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2)


 gamma <- rnorm(P + P2)
 Z <- cbind(X, Z_raw)
 alpha_1 <- rnorm(1)
 alpha <- rnorm(1)
 beta <- rnorm(P)
 sigma_u <- 2

 # Draw the structural shocks
 structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2))

 # Simulate reporting a salary (participation) and the wage
 participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0)
 wage <- participation * (alpha + X %*% beta + structural_shocks[,2])

 # Record the actual probabilities
 actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma)

 # Create the data list
 data_list <- list(N = N, 
                  P = P, 
                  P2 = P2, 
                  X = X, Z_raw = Z_raw, 
                  Y = as.numeric(wage), 
                  participation = participation, 
                  estimate_model = 1)


 # Comparing our Bayesian fit to Heckman correction ---------------------------------------

 # Vanilla OLS
 ols_fit <- lm(wage ~ ., data = as.data.frame(X) ,subset = participation==1)
 plot(coef(ols_fit), c(alpha, beta))
 abline(0, 1)

 # Heckman correction
 first_stage <- glm(participation ~ ., data = as.data.frame(Z), family = binomial(link = "probit"))
 yhat <- predict(first_stage, type = "response")
 lambda <- dnorm(yhat)/pnorm(yhat)
 second_stage <- lm(wage ~ . + lambda, data = as.data.frame(X), subset = participation == 1)
 plot(coef(second_stage)[1:(P+1)], c(alpha, beta))
 abline(0, 1)

 # Compiled and fit the model ----------------------------------------------

 compiled_model <- stan_model("heckman_correction.stan")


 fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4)
 fit_sim


 # Compare to posterior means

 plot(get_posterior_mean(fit_sim, pars = c("alpha", "beta"))[,5], c(alpha, beta))
 abline(0, 1)


 # Now fit by penalized likelihood

 fit_mle <- optimizing(compiled_model, data = data_list, init = 0)
 plot(extract_pars(fit_mle, c("alpha", "beta"))[-2], c(alpha, beta))
 abline(0, 1)

 plot(extract_pars(fit_mle, c("p\\[")),actual_p)
 abline(0, 1, col = 2)


 # Some rough plots of estimates vs generative actuals ---------------------

 confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T)

 confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T)

 data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), 
           values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>% 
  unnest() %>% 
  mutate(estimates = confints$estimate,
         lower = confints$conf.low,
         upper = confints$conf.high) %>% 
  ggplot(aes(x = values, y = estimates, colour = parameter)) +
  geom_point() +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  geom_abline(aes(intercept= 0, slope = 1))


 data_frame(parameter = "p", 
           values = list(actual_p)) %>% 
  unnest() %>% 
  mutate(estimates = confints_p$estimate,
         lower = confints_p$conf.low,
         upper = confints_p$conf.high,
         misestimated = values < lower | values > upper) %>% 
  ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  guides(alpha = F) +
  geom_linerange(aes(ymin = lower, ymax = upper)) +
  geom_abline(aes(intercept= 0, slope = 1))


 # What proportion of individual-level estimates fall within bounds? 
 # NB we probably need more iterations 
 data_frame(parameter = "p", 
           values = list(actual_p)) %>% 
  unnest() %>% 
  mutate(estimates = confints_p$estimate,
         lower = confints_p$conf.low,
         upper = confints_p$conf.high,
         misestimated = values < lower | values > upper) %>% 
  summarise(mean(misestimated))
	library(rstan); library(tidyverse)

	# A utility function
	extract_pars <- function(mle_fit, pars) {
	unlist(lapply(pars, function(n) mle_fit$par[grepl(pattern = n, x = names(mle_fit$par))]))
	}

	# Simulate fake data ------------------------------------------------------


	N <- 1000
	P <- 10
	P2 <- 5
	X = matrix(rnorm(N*P, 0, .5), N, P)
	Z_raw = matrix(rnorm(N*P2, 0, .5), N, P2)


	gamma <- rnorm(P + P2)
	Z <- cbind(X, Z_raw)
	alpha_1 <- rnorm(1)
	alpha <- rnorm(1)
	beta <- rnorm(P)
	sigma_u <- 2

	# Draw the structural shocks
	structural_shocks <- MASS::mvrnorm(N, c(0, 0), matrix(c(1, sigma_u * .5, sigma_u * .5, sigma_u^2), 2, 2))

	# Simulate reporting a salary (participation) and the wage
	participation <- as.numeric((alpha_1 + as.matrix(Z) %*% gamma + structural_shocks[,1])> 0)
	wage <- participation * (alpha + X %*% beta + structural_shocks[,2])

	# Record the actual probabilities
	actual_p <- pnorm(alpha_1 + as.matrix(Z) %*% gamma)

	# Create the data list
	data_list <- list(N = N,
	P = P,
	P2 = P2,
	X = X, Z_raw = Z_raw,
	Y = as.numeric(wage),
	participation = participation,
	estimate_model = 1)


	# Comparing our Bayesian fit to Heckman correction ---------------------------------------

	# Vanilla OLS
	ols_fit <- lm(wage ~ ., data = as.data.frame(X) ,subset = participation==1)
	plot(coef(ols_fit), c(alpha, beta))
	abline(0, 1)

	# Heckman correction
	first_stage <- glm(participation ~ ., data = as.data.frame(Z), family = binomial(link = "probit"))
	yhat <- predict(first_stage, type = "response")
	lambda <- dnorm(yhat)/pnorm(yhat)
	second_stage <- lm(wage ~ . + lambda, data = as.data.frame(X), subset = participation == 1)
	plot(coef(second_stage)[1:(P+1)], c(alpha, beta))
	abline(0, 1)

	# Compiled and fit the model ----------------------------------------------

	compiled_model <- stan_model("heckman_correction.stan")


	fit_sim <- sampling(compiled_model, data = data_list, chains = 4, iter = 1000, cores = 4)
	fit_sim


	# Compare to posterior means

	plot(get_posterior_mean(fit_sim, pars = c("alpha", "beta"))[,5], c(alpha, beta))
	abline(0, 1)


	# Now fit by penalized likelihood

	fit_mle <- optimizing(compiled_model, data = data_list, init = 0)
	plot(extract_pars(fit_mle, c("alpha", "beta"))[-2], c(alpha, beta))
	abline(0, 1)

	plot(extract_pars(fit_mle, c("p\\[")),actual_p)
	abline(0, 1, col = 2)


	# Some rough plots of estimates vs generative actuals ---------------------

	confints <- broom::tidy(fit_sim, pars = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"), conf.int = T)

	confints_p <- broom::tidy(fit_sim, pars = "p", conf.int = T)

	data_frame(parameter = c("alpha", "alpha_1", "beta", "gamma", "rho", "sigma_u"),
	values = c(alpha, alpha_1, list(beta), list(gamma), 0.5, sigma_u)) %>%
	unnest() %>%
	mutate(estimates = confints$estimate,
	lower = confints$conf.low,
	upper = confints$conf.high) %>%
	ggplot(aes(x = values, y = estimates, colour = parameter)) +
	geom_point() +
	geom_linerange(aes(ymin = lower, ymax = upper)) +
	geom_abline(aes(intercept= 0, slope = 1))


	data_frame(parameter = "p",
	values = list(actual_p)) %>%
	unnest() %>%
	mutate(estimates = confints_p$estimate,
	lower = confints_p$conf.low,
	upper = confints_p$conf.high,
	misestimated = values < lower \| values > upper) %>%
	ggplot(aes(x = values, y = estimates, colour = misestimated, alpha = (misestimated+ 1)/2)) +
	geom_point() +
	scale_color_manual(values = c("black", "red")) +
	guides(alpha = F) +
	geom_linerange(aes(ymin = lower, ymax = upper)) +
	geom_abline(aes(intercept= 0, slope = 1))


	# What proportion of individual-level estimates fall within bounds?
	# NB we probably need more iterations
	data_frame(parameter = "p",
	values = list(actual_p)) %>%
	unnest() %>%
	mutate(estimates = confints_p$estimate,
	lower = confints_p$conf.low,
	upper = confints_p$conf.high,
	misestimated = values < lower \| values > upper) %>%
	summarise(mean(misestimated))