dmontecino · October 20, 2021 17:40
diff --git a/gistfile1.txt b/gistfile1.txt
 ### Categorical missing data in Stan ###
 library(rstan)


 # ---- Simulation settings ----
 N <- 10000        # number of observations
 N_missing <- 100  # number of observations whose category is set to missing
 K <- 3            # number of categories

 # Unordered categorical covariate with levels 1..K, sampled with replacement.
 x <- sample(seq_len(K), size = N, replace = TRUE)

 # Simulate the binary response as a function of the categorical variable:
 # P(y = 1) is 0.7, 0.4 and 0.1 for levels 1, 2 and 3 respectively.
 # Indexing the probability vector by x vectorises the draw (no scalar loop).
 p_y_by_level <- c(0.7, 0.4, 0.1)
 y <- rbinom(n = N, size = 1, prob = p_y_by_level[x])

 # Following McElreath
 # (https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82),
 # simulate missingness completely at random.
 i_miss <- sample(seq_len(N), size = N_missing)

 # Copy of x with a placeholder where missing: Stan will not accept NA values.
 x_obs <- x
 x_obs[i_miss] <- -1L

 # Copy of x with NA where missing.
 x_NA <- x
 x_NA[i_miss] <- NA

 # Missingness indicator: 1 if x is unobserved, 0 otherwise.
 x_miss <- as.integer(seq_len(N) %in% i_miss)

 # Binary covariate used by the imputation model for x. It is drawn
 # independently of x, so it carries no information about the category.
 # Drawn in a single call so the vector always has length N: growing it through
 # logical-index assignment leaves it short (and yields NA for missing rows)
 # whenever the final observations are the ones with a missing category.
 cov_for_imp_x <- rbinom(n = N, size = 1, prob = 0.5)

 # Dummy variables for levels 2 and 3 (level 1 is the reference level).
 # Rows with a missing category carry the placeholder and therefore get 0.
 x_cat_2 <- as.integer(x_obs == 2)
 x_cat_3 <- as.integer(x_obs == 3)

 # Imputation covariate restricted to the rows whose category is missing.
 cov_for_imp_x_for_miss_cat <- cov_for_imp_x[x_miss == 1]

 # Stan program: logistic regression of y on a three-level categorical x, with
 # missing x values marginalised out. The category probabilities come from a
 # multinomial-logit imputation model driven by cov_for_imp_x. The program is
 # written for K = 3 (level 1 is the reference level in both sub-models).
 stan_model <- "
 data{
   int N; // number of observations
   int K; // number of categories
   int y[N]; // binary outcome
   int x_obs[N]; // categorical variable when observed (placeholder where missing)
   int x_miss[N]; // missingness indicator (1's and zero's): 1 if x is unobserved
   int cov_for_imp_x[N]; // a binary covariate for the prediction of the categorical variable x
   int x_cat_2[N]; // dummy variable: 1 when x has the second level
   int x_cat_3[N]; // dummy variable: 1 when x has the third level
 }

 parameters{
   real alpha; // intercept of the outcome model: log-odds of y at level 1 (both dummies zero)
   real beta1; // change in the log-odds of y at level 2 relative to level 1
   real beta2; // change in the log-odds of y at level 3 relative to level 1
   vector[K] a_imp; // intercepts of the imputation model (element 1 is unused: reference level)
   vector[K] b1_imp; // coefficients of the imputation model (element 1 is unused: reference level)
 }

 model{
   // priors
   alpha ~ normal(0,1);
   beta1 ~ normal(0,1);
   beta2 ~ normal(0,1);
   a_imp ~ normal(0,1);
   b1_imp ~ normal(0,1);

   // likelihood
   for (i in 1:N) {
     vector[K] theta; // unnormalised log-probabilities of each category

     theta[1] = 0; // reference level
     theta[2] = a_imp[2] + b1_imp[2]*cov_for_imp_x[i]; // level 2 as a function of the covariate
     theta[3] = a_imp[3] + b1_imp[3]*cov_for_imp_x[i]; // level 3 as a function of the covariate

     // x observed: fit the outcome given the category, and the category itself
     if (x_miss[i] == 0) {
       y[i] ~ bernoulli_logit(alpha + beta1*x_cat_2[i] + beta2*x_cat_3[i]);
       x_obs[i] ~ categorical( softmax(theta) );
     }

     // x missing: sum over the possible categories, weighting the likelihood
     // of y under each category by the probability of that category
     else {
       vector[K] log_theta;
       vector[K] lp;

       log_theta = log_softmax(theta);

       lp[1] = log_theta[1] + bernoulli_logit_lpmf( y[i] | alpha );
       lp[2] = log_theta[2] + bernoulli_logit_lpmf( y[i] | alpha + beta1 );
       lp[3] = log_theta[3] + bernoulli_logit_lpmf( y[i] | alpha + beta2 );

       target += log_sum_exp(lp);
     }
   }
 }
 "
	### Categorical missing data in Stan ###
	library(rstan)


	# ---- Simulation settings ----
	N <- 10000        # number of observations
	N_missing <- 100  # number of observations whose category is set to missing
	K <- 3            # number of categories

	# Unordered categorical covariate with levels 1..K, sampled with replacement.
	x <- sample(seq_len(K), size = N, replace = TRUE)

	# Simulate the binary response as a function of the categorical variable:
	# P(y = 1) is 0.7, 0.4 and 0.1 for levels 1, 2 and 3 respectively.
	# Indexing the probability vector by x vectorises the draw (no scalar loop).
	p_y_by_level <- c(0.7, 0.4, 0.1)
	y <- rbinom(n = N, size = 1, prob = p_y_by_level[x])

	# Following McElreath
	# (https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82),
	# simulate missingness completely at random.
	i_miss <- sample(seq_len(N), size = N_missing)

	# Copy of x with a placeholder where missing: Stan will not accept NA values.
	x_obs <- x
	x_obs[i_miss] <- -1L

	# Copy of x with NA where missing.
	x_NA <- x
	x_NA[i_miss] <- NA

	# Missingness indicator: 1 if x is unobserved, 0 otherwise.
	x_miss <- as.integer(seq_len(N) %in% i_miss)

	# Binary covariate used by the imputation model for x. It is drawn
	# independently of x, so it carries no information about the category.
	# Drawn in a single call so the vector always has length N: growing it through
	# logical-index assignment leaves it short (and yields NA for missing rows)
	# whenever the final observations are the ones with a missing category.
	cov_for_imp_x <- rbinom(n = N, size = 1, prob = 0.5)

	# Dummy variables for levels 2 and 3 (level 1 is the reference level).
	# Rows with a missing category carry the placeholder and therefore get 0.
	x_cat_2 <- as.integer(x_obs == 2)
	x_cat_3 <- as.integer(x_obs == 3)

	# Imputation covariate restricted to the rows whose category is missing.
	cov_for_imp_x_for_miss_cat <- cov_for_imp_x[x_miss == 1]

	# Stan program: logistic regression of y on a three-level categorical x, with
	# missing x values marginalised out. The category probabilities come from a
	# multinomial-logit imputation model driven by cov_for_imp_x. The program is
	# written for K = 3 (level 1 is the reference level in both sub-models).
	stan_model <- "
	data{
	  int N; // number of observations
	  int K; // number of categories
	  int y[N]; // binary outcome
	  int x_obs[N]; // categorical variable when observed (placeholder where missing)
	  int x_miss[N]; // missingness indicator (1's and zero's): 1 if x is unobserved
	  int cov_for_imp_x[N]; // a binary covariate for the prediction of the categorical variable x
	  int x_cat_2[N]; // dummy variable: 1 when x has the second level
	  int x_cat_3[N]; // dummy variable: 1 when x has the third level
	}

	parameters{
	  real alpha; // intercept of the outcome model: log-odds of y at level 1 (both dummies zero)
	  real beta1; // change in the log-odds of y at level 2 relative to level 1
	  real beta2; // change in the log-odds of y at level 3 relative to level 1
	  vector[K] a_imp; // intercepts of the imputation model (element 1 is unused: reference level)
	  vector[K] b1_imp; // coefficients of the imputation model (element 1 is unused: reference level)
	}

	model{
	  // priors
	  alpha ~ normal(0,1);
	  beta1 ~ normal(0,1);
	  beta2 ~ normal(0,1);
	  a_imp ~ normal(0,1);
	  b1_imp ~ normal(0,1);

	  // likelihood
	  for (i in 1:N) {
	    vector[K] theta; // unnormalised log-probabilities of each category

	    theta[1] = 0; // reference level
	    theta[2] = a_imp[2] + b1_imp[2]*cov_for_imp_x[i]; // level 2 as a function of the covariate
	    theta[3] = a_imp[3] + b1_imp[3]*cov_for_imp_x[i]; // level 3 as a function of the covariate

	    // x observed: fit the outcome given the category, and the category itself
	    if (x_miss[i] == 0) {
	      y[i] ~ bernoulli_logit(alpha + beta1*x_cat_2[i] + beta2*x_cat_3[i]);
	      x_obs[i] ~ categorical( softmax(theta) );
	    }

	    // x missing: sum over the possible categories, weighting the likelihood
	    // of y under each category by the probability of that category
	    else {
	      vector[K] log_theta;
	      vector[K] lp;

	      log_theta = log_softmax(theta);

	      lp[1] = log_theta[1] + bernoulli_logit_lpmf( y[i] | alpha );
	      lp[2] = log_theta[2] + bernoulli_logit_lpmf( y[i] | alpha + beta1 );
	      lp[3] = log_theta[3] + bernoulli_logit_lpmf( y[i] | alpha + beta2 );

	      target += log_sum_exp(lp);
	    }
	  }
	}
	"
No results found