Last active
October 20, 2021 17:40
-
-
Save dmontecino/b804853e4b36a57990a7108a35201cf5 to your computer and use it in GitHub Desktop.
STAN Imputation of a categorical covariate when non-observed and modeling of a binary outcome
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Categorical missing data in Stan ### | |
library(rstan) | |
N <- 10000 | |
N_missing <- 100 | |
K <- 3 # number of categories | |
x <- sample( seq(from=1,to=K), size=N, replace=TRUE ) # unordered cateogrical covariate | |
# Simulating bivariate respose as function of a categorcial variable | |
y <- rep(NA, N) | |
for (i in 1:N) { | |
if (x[i] == 1) | |
y[i] = rbinom(n = 1, size = 1 , prob = 0.7 ) | |
else if (x[i] == 2) | |
y[i] = rbinom(n = 1, size = 1 , prob = 0.4 ) | |
else # x = 3 | |
y[i] = rbinom(n = 1, size = 1 , prob = 0.1 ) | |
} | |
# Following McElreath (https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82), simulate missing | |
i_miss <- sample( 1:N , size=N_missing ) | |
x_obs <- x | |
x_obs[i_miss] <- (-1) # placeholder, Stan will not accept NA values | |
x_NA <- x_obs | |
x_NA[i_miss] <- NA | |
x_miss <- ifelse( 1:N %in% i_miss , 1 , 0 ) | |
cov_for_imp_x<-NA | |
cov_for_imp_x[x_obs==1]=rbinom(length(cov_for_imp_x[x_obs==1]), size=1, prob=0.5) | |
cov_for_imp_x[x_obs==2]=rbinom(length(cov_for_imp_x[x_obs==2]), size=1, prob=0.5) | |
cov_for_imp_x[x_obs==3]=rbinom(length(cov_for_imp_x[x_obs==3]), size=1, prob=0.5) | |
cov_for_imp_x[is.na(cov_for_imp_x)]=rbinom(length(cov_for_imp_x[is.na(cov_for_imp_x)]), size=1, prob=0.5) | |
#covariate does not predict the reproductive season | |
x_cat_2=ifelse(x_obs==2,1,0) | |
x_cat_3=ifelse(x_obs==3,1,0) | |
cov_for_imp_x_for_miss_cat=cov_for_imp_x[x_miss==1] | |
stan_model <- " | |
data{ | |
int N; //number of observations | |
int K; //number of categories | |
int y[N]; // binary outcome | |
int x_obs[N]; // categorical variable when observed | |
int x_miss[N]; // categorical variable when unobserved (index with 1's and zero's). 1 if unobserved | |
int cov_for_imp_x[N]; // a binary covariate for the prediction of x categorical variable | |
int x_cat_2[N]; // dummy variable when x has the second level | |
int x_cat_3[N]; // dummy variable when x has the first level | |
} | |
parameters{ | |
real alpha; // coefficient of the binary outcome as a function of the categorical variable level 1 (dummy 1 only zeros) | |
real beta1; // coefficient of the binary outcome as a function of the categorical variable level 2 (dummy 2) | |
real beta2; // coefficient of the binary outcome as a function of the categorical variable level 3 (dummy 3) | |
vector[K] a_imp; // intercept for the imputation model | |
vector[K] b1_imp; // coefficent for the imputation model | |
} | |
model{ | |
// priors | |
alpha ~ normal(0,1); // explained above | |
beta1 ~ normal(0,1); // explained above | |
beta2 ~ normal(0,1); // explained above | |
a_imp ~ normal(0,1); // explained above | |
b1_imp2 ~ normal(0,1); | |
//Data | |
for (i in 1:N) { | |
vector[K] p; | |
vector[K] theta; | |
p[2] = a_imp[2] + b1_imp[2]*cov_for_imp_x[i]; // modeling the prob 2 as a function of the covariate to model the category | |
p[3] = a_imp[3] + b1_imp[3]*cov_for_imp_x[i]; // modeling the prob 3 as a function of the covariate to model the category | |
// x not missing fit the binary variable as a function of the categories | |
if (x_miss[i] == 0) { | |
y[i] ~ bernoulli_logit(alpha+ | |
beta1*x_cat_2[i]+ | |
beta2*x_cat_3[i]); | |
theta[1] = 0; | |
theta[2] = p[2]; | |
theta[3] = p[3]; | |
x_obs[i] ~ categorical( softmax(theta)); | |
} | |
// x missing model the category and posteiorly model the binary outcome as a function of the imputed category | |
else { | |
vector[n_cat] lp; | |
theta[1] = 0; | |
theta[2] = p[2]; | |
theta[3] = p[3]; | |
lp[1] = log_softmax(theta)[1] + bernoulli_logit_lpmf( y[i] | alpha + beta1); | |
lp[2] = log_softmax(theta)[2] + bernoulli_logit_lpmf( y[i] | alpha); | |
lp[3] = log_softmax(theta)[3] + bernoulli_logit_lpmf( y[i] | alpha + beta2); | |
target += log_sum_exp(lp); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment