adamwespiser · January 31, 2018 06:49
diff --git a/log-reg-posterior.R b/log-reg-posterior.R
 library(rstan)
 library(dplyr)
 options(mc.cores = parallel::detectCores())
 rstan_options(auto_write = TRUE)

 seed_sim = 1234
 set.seed(seed_sim)

 # set up data
 N = 4000
 K =  15
 X =  matrix(rnorm(N * K), nrow = N, ncol = K)
 beta_forward = rnorm(n = K, mean = 0, sd = 1)
 alpha_forward = 1
 z = (X %*% beta_forward) %>% as.numeric() + alpha_forward
 inv_logit = function (s){exp(s)/(1 + exp(s))}
 pr = inv_logit(z)
 y = rbinom(N, 1, prob = pr)

 # quick sanity check on params using glm w/ binomial link
 df = data.frame(y = y, x1 = X[,1], x2 = X[,2])
 glm( y ~ x1 + x2, data = df, family = "binomial")

 # stan code: student_t for coefficients, centered
 stan_code = "
 data {
  int<lower=0>         N;
  int<lower=0>         K;
  matrix[N,K]          X;
  int<lower=0,upper=1> y[N];
 }
 parameters {
  real      a; 
  real      b_shape;
  real      b_location;
  vector[K] b; 
 }
 transformed parameters {
  vector[N] pr_y;
  pr_y  = inv_logit(X * b + a);
 }
 model {
  a            ~ normal(1,1);

  b_location   ~ normal(0,1);
  b_shape      ~ gamma(0.1, 0.1);
  to_vector(b) ~ normal(b_location, b_shape);

  y            ~ binomial(1,pr_y);
 }

 "

 # run STAN w/ a little larger adaptive delta
 nchains = 1
 stanfit = stan(model_code = stan_code, 
               data = list(X = X, y = y, K = K, N = N),
               control = list(adapt_delta = 0.95,
                              max_treedepth = 15),
               chains = nchains,
               seed = seed_sim )



 # get model summary
 mod_summary <- as.data.frame(summary(stanfit)[[1]])

 # extract simulated variables
 b_sim = extract(stanfit, "b")$b
 a_sim = extract(stanfit, "a")$a
 pr_y_sim = extract(stanfit, "pr_y")$pr_y
 # demonstrate that simulated values are equal to mean
 n_subsample = 241
 pr_y_model_mean = mod_summary[grep("pr_y", rownames(mod_summary)), c("mean")]
 pry = sapply(seq_len(n_subsample), function(n)
  sapply(seq_len(nchains * 1000),
       function(i)inv_logit(sum((b_sim[i,] * X[n,]))  + a_sim[i] )) %>% mean(., na.rm = TRUE))
 (pry - pr_y_model_mean[seq_len(n_subsample)]) %>% abs %>% sum()

 # AUC sanity check that model is predictive 
 library(ROCR);AUC = performance(prediction(pr_y_model_mean, y),"auc")@y.values[[1]]
 print(paste("AUC: ", AUC))


 ## Now for our experiment
 # grab the mean values, as summarised 
 b_mean = mod_summary[grep("^b\\[", rownames(mod_summary)), c("mean")]
 a_mean = mod_summary[grep("^a", rownames(mod_summary)), c('mean')]
 # run a vectorized calculation 
 pr_y_from_b_mean = inv_logit(as.numeric(X %*% b_mean + a_mean))
 ## some formatting....
 pr_y_model_mean_fmt= format(pr_y_model_mean, digits = 4,scientific = FALSE)
 pr_y_from_b_mean_fmt = format(pr_y_from_b_mean, digits = 4, scientific = FALSE)
 out_df = data.frame(y = y, 
                    prob_stan = pr_y_model_mean_fmt, 
                    prob_mean = pr_y_from_b_mean_fmt)
 if ("1" == Sys.getenv("RSTUDIO")){
  View(out_df)
 } else {
  out_df %>% head()
 }
	library(rstan)
	library(dplyr)
	options(mc.cores = parallel::detectCores())
	rstan_options(auto_write = TRUE)

	seed_sim = 1234
	set.seed(seed_sim)

	# set up data
	N = 4000
	K = 15
	X = matrix(rnorm(N * K), nrow = N, ncol = K)
	beta_forward = rnorm(n = K, mean = 0, sd = 1)
	alpha_forward = 1
	z = (X %*% beta_forward) %>% as.numeric() + alpha_forward
	inv_logit = function (s){exp(s)/(1 + exp(s))}
	pr = inv_logit(z)
	y = rbinom(N, 1, prob = pr)

	# quick sanity check on params using glm w/ binomial link
	df = data.frame(y = y, x1 = X[,1], x2 = X[,2])
	glm( y ~ x1 + x2, data = df, family = "binomial")

	# stan code: student_t for coefficients, centered
	stan_code = "
	data {
	int<lower=0> N;
	int<lower=0> K;
	matrix[N,K] X;
	int<lower=0,upper=1> y[N];
	}
	parameters {
	real a;
	real b_shape;
	real b_location;
	vector[K] b;
	}
	transformed parameters {
	vector[N] pr_y;
	pr_y = inv_logit(X * b + a);
	}
	model {
	a ~ normal(1,1);

	b_location ~ normal(0,1);
	b_shape ~ gamma(0.1, 0.1);
	to_vector(b) ~ normal(b_location, b_shape);

	y ~ binomial(1,pr_y);
	}

	"

	# run STAN w/ a little larger adaptive delta
	nchains = 1
	stanfit = stan(model_code = stan_code,
	data = list(X = X, y = y, K = K, N = N),
	control = list(adapt_delta = 0.95,
	max_treedepth = 15),
	chains = nchains,
	seed = seed_sim )



	# get model summary
	mod_summary <- as.data.frame(summary(stanfit)[[1]])

	# extract simulated variables
	b_sim = extract(stanfit, "b")$b
	a_sim = extract(stanfit, "a")$a
	pr_y_sim = extract(stanfit, "pr_y")$pr_y
	# demonstrate that simulated values are equal to mean
	n_subsample = 241
	pr_y_model_mean = mod_summary[grep("pr_y", rownames(mod_summary)), c("mean")]
	pry = sapply(seq_len(n_subsample), function(n)
	sapply(seq_len(nchains * 1000),
	function(i)inv_logit(sum((b_sim[i,] * X[n,])) + a_sim[i] )) %>% mean(., na.rm = TRUE))
	(pry - pr_y_model_mean[seq_len(n_subsample)]) %>% abs %>% sum()

	# AUC sanity check that model is predictive
	library(ROCR);AUC = performance(prediction(pr_y_model_mean, y),"auc")@y.values[[1]]
	print(paste("AUC: ", AUC))


	## Now for our experiment
	# grab the mean values, as summarised
	b_mean = mod_summary[grep("^b\\[", rownames(mod_summary)), c("mean")]
	a_mean = mod_summary[grep("^a", rownames(mod_summary)), c('mean')]
	# run a vectorized calculation
	pr_y_from_b_mean = inv_logit(as.numeric(X %*% b_mean + a_mean))
	## some formatting....
	pr_y_model_mean_fmt= format(pr_y_model_mean, digits = 4,scientific = FALSE)
	pr_y_from_b_mean_fmt = format(pr_y_from_b_mean, digits = 4, scientific = FALSE)
	out_df = data.frame(y = y,
	prob_stan = pr_y_model_mean_fmt,
	prob_mean = pr_y_from_b_mean_fmt)
	if ("1" == Sys.getenv("RSTUDIO")){
	View(out_df)
	} else {
	out_df %>% head()
	}