eveskew · December 4, 2018 20:12
diff --git a/discrete_missingness_multiple_cats.R b/discrete_missingness_multiple_cats.R
 # Categorical missing data in Stan

 # Demonstrated with three categories

 # Code and ideas adapted from @rmcelreath
 # https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82

 # Code and ideas discussed with @dmontecino
 # https://gist.github.com/dmontecino/b804853e4b36a57990a7108a35201cf5


 library(rstan)
 library(rethinking) # devtools::install_github("rmcelreath/rethinking")

 #==============================================================================


 # Data setup


 # Note: in this simulated data, categories are unbalanced and the data are
 # missing completely at random (missingness is independent of both x and y)

 N <- 5000 # number of simulated observations
 N_missing <- 100 # number of observations whose category is hidden
 K <- 3 # number of categories

 # Generate unordered categorical covariates, with unbalanced observations
 # Category 1 = 50% of simulated data
 # Category 2 = 30% of simulated data
 # Category 3 = 20% of simulated data
 cat_probs <- c(0.5, 0.3, 0.2)
 stopifnot(length(cat_probs) == K)
 # rmultinom() returns a K x N indicator matrix (one 1 per column); convert
 # each column to the integer label of the category that was drawn
 x_onehot <- rmultinom(N, size = 1, prob = cat_probs)
 x <- vapply(seq_len(N), function(i) which(x_onehot[, i] == 1), integer(1))

 # Simulate binary response as a function of the category:
 # Pr(y = 1) is 0.8, 0.3, and 0.1 for categories 1, 2, and 3 respectively
 y_probs <- c(0.8, 0.3, 0.1)
 y <- rbinom(N, size = 1, prob = y_probs[x])

 # Simulate missing data by hiding the category of N_missing random rows
 i_miss <- sample.int(N, size = N_missing)
 x_obs <- x
 x_obs[i_miss] <- -1L # placeholder, Stan will not accept NA values
 x_miss <- as.integer(seq_len(N) %in% i_miss) # 1 = category is unobserved

 # Create a covariate for use in prediction of the categorical variable
 # Note: in this case, the covariate does not actually correlate with the
 # category (all entries of cov_probs are equal); give the categories
 # different probabilities to induce a correlation.
 # The draw is indexed by the TRUE category x (not x_obs) so that rows with
 # a missing category would still carry the covariate signal.
 cov_probs <- c(0.5, 0.5, 0.5)
 cov_for_x <- rbinom(N, size = 1, prob = cov_probs[x])

 # Create dummy variables for use in the model
 # (all three are 0 for rows where the category is unobserved)
 x_cat_1 <- as.integer(x_obs == 1)
 x_cat_2 <- as.integer(x_obs == 2)
 x_cat_3 <- as.integer(x_obs == 3)
 cov_for_x_for_miss_cat <- cov_for_x[x_miss == 1]


 # Plot the category frequencies across the complete simulated sample
 simplehist(x)

 # Plot the category frequencies for only those rows whose x was hidden
 x_unobserved <- x[x_miss == 1]
 simplehist(x_unobserved)

 # The categories are unbalanced in both plots; because rows were hidden at
 # random, the hidden subset shows the same pattern as the full sample

 #==============================================================================


 # Model definition


 # Stan program (kept as a single string and passed to stan(model_code = )).
 # It jointly models the categorical predictor x and the binary outcome y;
 # for rows where x is unobserved, the likelihood of y is marginalized over
 # the K possible categories.
 # NOTE(review): array declarations use the `int y[N]` form; newer Stan
 # releases expect `array[N] int y` - confirm against the installed rstan.
 model <- "

 data {

 int<lower=0> N; // number of observations
 int<lower=3, upper=3> K; // number of categories (this model is written for exactly 3)
 int<lower=0, upper=1> y[N]; // binary outcome variable
 int<lower=-1, upper=K> x_obs[N]; // observed categorical variable (-1 when unobserved)
 int<lower=0, upper=1> x_miss[N]; // dummy variable indicating missingness in the categorical variable
 int<lower=0, upper=1> cov_for_x[N]; // a binary predictor for the x categorical variable
 int<lower=0, upper=1> x_cat_1[N]; // dummy variable indicating when x has the first level
 int<lower=0, upper=1> x_cat_2[N]; // dummy variable indicating when x has the second level
 int<lower=0, upper=1> x_cat_3[N]; // dummy variable indicating when x has the third level
 }

 ///////////////////////////////////////////////////////////////////////////////

 parameters {

 real beta1; // coefficient for categorical variable level 1
 real beta2; // coefficient for categorical variable level 2
 real beta3; // coefficient for categorical variable level 3

 real a_cat2; // intercept to model the probability of category 2
 real a_cat3; // intercept to model the probability of category 3
 real b_cat2; // coefficient to model the probability of category 2
 real b_cat3; // coefficient to model the probability of category 3
 }

 ///////////////////////////////////////////////////////////////////////////////

 model {

 // priors for all parameters
 beta1 ~ normal(0, 1);
 beta2 ~ normal(0, 1);
 beta3 ~ normal(0, 1);
 a_cat2 ~ normal(0, 1);
 a_cat3 ~ normal(0, 1);
 b_cat2 ~ normal(0, 1);
 b_cat3 ~ normal(0, 1);

 // likelihood
 for (i in 1:N) {

 vector[K] score; // vector containing log-odds of being in category 1, 2, or 3
 score[1] = 0; // score[1] is fixed at zero (reference category)
 score[2] = a_cat2 + b_cat2*cov_for_x[i]; // modeling the log-odds of category 2
 score[3] = a_cat3 + b_cat3*cov_for_x[i]; // modeling the log-odds of category 3

 if (x_miss[i] == 1) { // x is unobserved

 // model the binary outcome, marginalizing over missingness

 vector[K] logPx; // log probability of each x category, log Pr(k)
 vector[K] logPxy; // vector to hold the log probabilities for each alternate scenario (category 1, 2, or 3)

 logPx = log_softmax(score); // computed once and reused for every category

 logPxy[1] = logPx[1] + bernoulli_logit_lpmf(y[i] | beta1); // category 1
 logPxy[2] = logPx[2] + bernoulli_logit_lpmf(y[i] | beta2); // category 2
 logPxy[3] = logPx[3] + bernoulli_logit_lpmf(y[i] | beta3); // category 3

 target += log_sum_exp(logPxy); // sum probabilities across the scenarios on the log scale (i.e., marginalize over missingness)
 }

 else { // x is observed

 x_obs[i] ~ categorical_logit(score); // likelihood statement for x categorical variable (softmax applied internally)
 y[i] ~ bernoulli_logit(beta1*x_cat_1[i] + beta2*x_cat_2[i] + beta3*x_cat_3[i]); // likelihood statement for outcome
 }

 } // close loop

 } // close model block

 ///////////////////////////////////////////////////////////////////////////////

 generated quantities { // generate estimates of the imputed category for all observations

 matrix[N, K] x_imp; // matrix to contain for all N observations the probability of belonging to category 1:K

 for (i in 1:N) {

 vector[K] score; // vector containing log-odds of being in category 1, 2, or 3
 score[1] = 0; // score[1] is fixed at zero (reference category)
 score[2] = a_cat2 + b_cat2*cov_for_x[i]; // modeling the log-odds of category 2
 score[3] = a_cat3 + b_cat3*cov_for_x[i]; // modeling the log-odds of category 3

 if (x_miss[i] == 1) { // x is unobserved

 // want probability of the unobserved x value belonging to each category,
 // given the observed y value
 // so with K = 3 we want: Pr(1|y), Pr(2|y), and Pr(3|y)
 // which is equivalent to: Pr(1,y)/Pr(y), Pr(2,y)/Pr(y), and Pr(3,y)/Pr(y)

 vector[K] logPx; // log Pr(1), log Pr(2), and log Pr(3) values
 vector[K] logPxy; // vector for log Pr(1,y), log Pr(2,y), and log Pr(3,y) values
 real logPy; // log Pr(y) value

 logPx = log_softmax(score); // computed once and reused for every category

 // calculate log Pr(x,y) values for 1:K

 logPxy[1] = logPx[1] + bernoulli_logit_lpmf(y[i] | beta1); // Pr(1,y) = Pr(1)Pr(y|1)
 logPxy[2] = logPx[2] + bernoulli_logit_lpmf(y[i] | beta2); // Pr(2,y) = Pr(2)Pr(y|2)
 logPxy[3] = logPx[3] + bernoulli_logit_lpmf(y[i] | beta3); // Pr(3,y) = Pr(3)Pr(y|3)

 // calculate log Pr(y) (as in the model likelihood statement)

 logPy = log_sum_exp(logPxy); // sum probabilities across the scenarios on the log scale (i.e., marginalize over missingness)

 // populate the x_imp matrix row for the ith observation

 x_imp[i, 1] = exp(logPxy[1] - logPy); // Pr(1|y) = Pr(1,y)/Pr(y)
 x_imp[i, 2] = exp(logPxy[2] - logPy); // Pr(2|y) = Pr(2,y)/Pr(y)
 x_imp[i, 3] = exp(logPxy[3] - logPy); // Pr(3|y) = Pr(3,y)/Pr(y)
 }

 else { // x is observed

 x_imp[i, 1:K] = rep_row_vector(0, K);
 x_imp[i, x_obs[i]] = 1; // when the category has been observed, we know the category
 }

 } // close loop

 } // close generated quantities block
 "

 #==============================================================================


 # Fit the data


 # Every variable declared in the Stan data block must be supplied here,
 # including the x_cat_* dummy variables used in the outcome likelihood;
 # omitting them makes stan() fail during data initialization.
 stan_data <- list(
   N = N, K = K, y = y, x_obs = x_obs,
   x_miss = x_miss, cov_for_x = cov_for_x,
   x_cat_1 = x_cat_1, x_cat_2 = x_cat_2, x_cat_3 = x_cat_3
 )

 # Run 4 chains of 1000 iterations each, one chain per core; adapt_delta and
 # max_treedepth are passed through the sampler's control list
 fit_model <-
   stan(model_code = model,
        data = stan_data,
        iter = 1000, chains = 4, cores = 4,
        control = list(adapt_delta = 0.995, max_treedepth = 15))

 # Examine the model output

 precis(fit_model)
 # Qualify extract() so that a same-named function from another attached
 # package cannot mask the rstan version
 out <- rstan::extract(fit_model)

 # The model is correctly recovering the relationship between category
 # and outcome...

 logistic(mean(out$beta1)) # should be ~0.8
 logistic(mean(out$beta2)) # should be ~0.3
 logistic(mean(out$beta3)) # should be ~0.1

 # And is correctly recovering the relative probability of belonging to a
 # category...
 # (only the intercepts are used, so these are the category probabilities
 # when cov_for_x = 0; they match the overall proportions here because the
 # simulated covariate is unrelated to the category)

 softmax(0, mean(out$a_cat2), mean(out$a_cat3))
 # should be ~0.5, ~0.3, ~0.2

 # Note that if the missingness highly skews the observed x distribution
 # relative to the true x distribution, I do not think this will be the case...


 # This model is also generating estimates for the probability of missing
 # values belonging to different x categories

 # Generate a summary of the posterior probabilities for category assignment
 # out$x_imp is indexed [draw, observation, category]; colMeans() averages
 # over the first (draw) dimension

 post.prob.means <- colMeans(out$x_imp)
 # this is a N x K matrix showing the mean probabilities

 # Show this matrix with only those rows representing missing x data
 # (drop = FALSE keeps a matrix even if only one row is missing)

 post.prob.means[x_miss == 1, , drop = FALSE]

 # And show this data along with the relevant y observations

 cbind(post.prob.means[x_miss == 1, , drop = FALSE], y[x_miss == 1])
 # this shows that the probabilities of belonging to a category
 # shift depending upon the observed data
	# Categorical missing data in Stan

	# Demonstrated with three categories

	# Code and ideas adapted from @rmcelreath
	# https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82

	# Code and ideas discussed with @dmontecino
	# https://gist.github.com/dmontecino/b804853e4b36a57990a7108a35201cf5


	library(rstan)
	library(rethinking) # devtools::install_github("rmcelreath/rethinking")

	#==============================================================================


	# Data setup


	# Note: in this simulated data, categories are unbalanced and the data are
	# missing completely at random (missingness is independent of both x and y)

	N <- 5000 # number of simulated observations
	N_missing <- 100 # number of observations whose category is hidden
	K <- 3 # number of categories

	# Generate unordered categorical covariates, with unbalanced observations
	# Category 1 = 50% of simulated data
	# Category 2 = 30% of simulated data
	# Category 3 = 20% of simulated data
	cat_probs <- c(0.5, 0.3, 0.2)
	stopifnot(length(cat_probs) == K)
	# rmultinom() returns a K x N indicator matrix (one 1 per column); convert
	# each column to the integer label of the category that was drawn
	x_onehot <- rmultinom(N, size = 1, prob = cat_probs)
	x <- vapply(seq_len(N), function(i) which(x_onehot[, i] == 1), integer(1))

	# Simulate binary response as a function of the category:
	# Pr(y = 1) is 0.8, 0.3, and 0.1 for categories 1, 2, and 3 respectively
	y_probs <- c(0.8, 0.3, 0.1)
	y <- rbinom(N, size = 1, prob = y_probs[x])

	# Simulate missing data by hiding the category of N_missing random rows
	i_miss <- sample.int(N, size = N_missing)
	x_obs <- x
	x_obs[i_miss] <- -1L # placeholder, Stan will not accept NA values
	x_miss <- as.integer(seq_len(N) %in% i_miss) # 1 = category is unobserved

	# Create a covariate for use in prediction of the categorical variable
	# Note: in this case, the covariate does not actually correlate with the
	# category (all entries of cov_probs are equal); give the categories
	# different probabilities to induce a correlation.
	# The draw is indexed by the TRUE category x (not x_obs) so that rows with
	# a missing category would still carry the covariate signal.
	cov_probs <- c(0.5, 0.5, 0.5)
	cov_for_x <- rbinom(N, size = 1, prob = cov_probs[x])

	# Create dummy variables for use in the model
	# (all three are 0 for rows where the category is unobserved)
	x_cat_1 <- as.integer(x_obs == 1)
	x_cat_2 <- as.integer(x_obs == 2)
	x_cat_3 <- as.integer(x_obs == 3)
	cov_for_x_for_miss_cat <- cov_for_x[x_miss == 1]


	# Plot the category frequencies across the complete simulated sample
	simplehist(x)

	# Plot the category frequencies for only those rows whose x was hidden
	x_unobserved <- x[x_miss == 1]
	simplehist(x_unobserved)

	# The categories are unbalanced in both plots; because rows were hidden at
	# random, the hidden subset shows the same pattern as the full sample

	#==============================================================================


	# Model definition


	# Stan program (kept as a single string and passed to stan(model_code = )).
	# It jointly models the categorical predictor x and the binary outcome y;
	# for rows where x is unobserved, the likelihood of y is marginalized over
	# the K possible categories.
	# NOTE(review): array declarations use the `int y[N]` form; newer Stan
	# releases expect `array[N] int y` - confirm against the installed rstan.
	model <- "

	data {

	int<lower=0> N; // number of observations
	int<lower=3, upper=3> K; // number of categories (this model is written for exactly 3)
	int<lower=0, upper=1> y[N]; // binary outcome variable
	int<lower=-1, upper=K> x_obs[N]; // observed categorical variable (-1 when unobserved)
	int<lower=0, upper=1> x_miss[N]; // dummy variable indicating missingness in the categorical variable
	int<lower=0, upper=1> cov_for_x[N]; // a binary predictor for the x categorical variable
	int<lower=0, upper=1> x_cat_1[N]; // dummy variable indicating when x has the first level
	int<lower=0, upper=1> x_cat_2[N]; // dummy variable indicating when x has the second level
	int<lower=0, upper=1> x_cat_3[N]; // dummy variable indicating when x has the third level
	}

	///////////////////////////////////////////////////////////////////////////////

	parameters {

	real beta1; // coefficient for categorical variable level 1
	real beta2; // coefficient for categorical variable level 2
	real beta3; // coefficient for categorical variable level 3

	real a_cat2; // intercept to model the probability of category 2
	real a_cat3; // intercept to model the probability of category 3
	real b_cat2; // coefficient to model the probability of category 2
	real b_cat3; // coefficient to model the probability of category 3
	}

	///////////////////////////////////////////////////////////////////////////////

	model {

	// priors for all parameters
	beta1 ~ normal(0, 1);
	beta2 ~ normal(0, 1);
	beta3 ~ normal(0, 1);
	a_cat2 ~ normal(0, 1);
	a_cat3 ~ normal(0, 1);
	b_cat2 ~ normal(0, 1);
	b_cat3 ~ normal(0, 1);

	// likelihood
	for (i in 1:N) {

	vector[K] score; // vector containing log-odds of being in category 1, 2, or 3
	score[1] = 0; // score[1] is fixed at zero (reference category)
	score[2] = a_cat2 + b_cat2*cov_for_x[i]; // modeling the log-odds of category 2
	score[3] = a_cat3 + b_cat3*cov_for_x[i]; // modeling the log-odds of category 3

	if (x_miss[i] == 1) { // x is unobserved

	// model the binary outcome, marginalizing over missingness

	vector[K] logPx; // log probability of each x category, log Pr(k)
	vector[K] logPxy; // vector to hold the log probabilities for each alternate scenario (category 1, 2, or 3)

	logPx = log_softmax(score); // computed once and reused for every category

	logPxy[1] = logPx[1] + bernoulli_logit_lpmf(y[i] | beta1); // category 1
	logPxy[2] = logPx[2] + bernoulli_logit_lpmf(y[i] | beta2); // category 2
	logPxy[3] = logPx[3] + bernoulli_logit_lpmf(y[i] | beta3); // category 3

	target += log_sum_exp(logPxy); // sum probabilities across the scenarios on the log scale (i.e., marginalize over missingness)
	}

	else { // x is observed

	x_obs[i] ~ categorical_logit(score); // likelihood statement for x categorical variable (softmax applied internally)
	y[i] ~ bernoulli_logit(beta1*x_cat_1[i] + beta2*x_cat_2[i] + beta3*x_cat_3[i]); // likelihood statement for outcome
	}

	} // close loop

	} // close model block

	///////////////////////////////////////////////////////////////////////////////

	generated quantities { // generate estimates of the imputed category for all observations

	matrix[N, K] x_imp; // matrix to contain for all N observations the probability of belonging to category 1:K

	for (i in 1:N) {

	vector[K] score; // vector containing log-odds of being in category 1, 2, or 3
	score[1] = 0; // score[1] is fixed at zero (reference category)
	score[2] = a_cat2 + b_cat2*cov_for_x[i]; // modeling the log-odds of category 2
	score[3] = a_cat3 + b_cat3*cov_for_x[i]; // modeling the log-odds of category 3

	if (x_miss[i] == 1) { // x is unobserved

	// want probability of the unobserved x value belonging to each category,
	// given the observed y value
	// so with K = 3 we want: Pr(1|y), Pr(2|y), and Pr(3|y)
	// which is equivalent to: Pr(1,y)/Pr(y), Pr(2,y)/Pr(y), and Pr(3,y)/Pr(y)

	vector[K] logPx; // log Pr(1), log Pr(2), and log Pr(3) values
	vector[K] logPxy; // vector for log Pr(1,y), log Pr(2,y), and log Pr(3,y) values
	real logPy; // log Pr(y) value

	logPx = log_softmax(score); // computed once and reused for every category

	// calculate log Pr(x,y) values for 1:K

	logPxy[1] = logPx[1] + bernoulli_logit_lpmf(y[i] | beta1); // Pr(1,y) = Pr(1)Pr(y|1)
	logPxy[2] = logPx[2] + bernoulli_logit_lpmf(y[i] | beta2); // Pr(2,y) = Pr(2)Pr(y|2)
	logPxy[3] = logPx[3] + bernoulli_logit_lpmf(y[i] | beta3); // Pr(3,y) = Pr(3)Pr(y|3)

	// calculate log Pr(y) (as in the model likelihood statement)

	logPy = log_sum_exp(logPxy); // sum probabilities across the scenarios on the log scale (i.e., marginalize over missingness)

	// populate the x_imp matrix row for the ith observation

	x_imp[i, 1] = exp(logPxy[1] - logPy); // Pr(1|y) = Pr(1,y)/Pr(y)
	x_imp[i, 2] = exp(logPxy[2] - logPy); // Pr(2|y) = Pr(2,y)/Pr(y)
	x_imp[i, 3] = exp(logPxy[3] - logPy); // Pr(3|y) = Pr(3,y)/Pr(y)
	}

	else { // x is observed

	x_imp[i, 1:K] = rep_row_vector(0, K);
	x_imp[i, x_obs[i]] = 1; // when the category has been observed, we know the category
	}

	} // close loop

	} // close generated quantities block
	"

	#==============================================================================


	# Fit the data


	# Every variable declared in the Stan data block must be supplied here,
	# including the x_cat_* dummy variables used in the outcome likelihood;
	# omitting them makes stan() fail during data initialization.
	stan_data <- list(
	  N = N, K = K, y = y, x_obs = x_obs,
	  x_miss = x_miss, cov_for_x = cov_for_x,
	  x_cat_1 = x_cat_1, x_cat_2 = x_cat_2, x_cat_3 = x_cat_3
	)

	# Run 4 chains of 1000 iterations each, one chain per core; adapt_delta and
	# max_treedepth are passed through the sampler's control list
	fit_model <-
	  stan(model_code = model,
	       data = stan_data,
	       iter = 1000, chains = 4, cores = 4,
	       control = list(adapt_delta = 0.995, max_treedepth = 15))

	# Examine the model output

	precis(fit_model)
	# Qualify extract() so that a same-named function from another attached
	# package cannot mask the rstan version
	out <- rstan::extract(fit_model)

	# The model is correctly recovering the relationship between category
	# and outcome...

	logistic(mean(out$beta1)) # should be ~0.8
	logistic(mean(out$beta2)) # should be ~0.3
	logistic(mean(out$beta3)) # should be ~0.1

	# And is correctly recovering the relative probability of belonging to a
	# category...
	# (only the intercepts are used, so these are the category probabilities
	# when cov_for_x = 0; they match the overall proportions here because the
	# simulated covariate is unrelated to the category)

	softmax(0, mean(out$a_cat2), mean(out$a_cat3))
	# should be ~0.5, ~0.3, ~0.2

	# Note that if the missingness highly skews the observed x distribution
	# relative to the true x distribution, I do not think this will be the case...


	# This model is also generating estimates for the probability of missing
	# values belonging to different x categories

	# Generate a summary of the posterior probabilities for category assignment
	# out$x_imp is indexed [draw, observation, category]; colMeans() averages
	# over the first (draw) dimension

	post.prob.means <- colMeans(out$x_imp)
	# this is a N x K matrix showing the mean probabilities

	# Show this matrix with only those rows representing missing x data
	# (drop = FALSE keeps a matrix even if only one row is missing)

	post.prob.means[x_miss == 1, , drop = FALSE]

	# And show this data along with the relevant y observations

	cbind(post.prob.means[x_miss == 1, , drop = FALSE], y[x_miss == 1])
	# this shows that the probabilities of belonging to a category
	# shift depending upon the observed data