Created
January 31, 2018 06:49
-
-
Save adamwespiser/1cad89738fbf1ea1553ed7fe77869070 to your computer and use it in GitHub Desktop.
Bayesian Log Reg: What is the difference between the expected pr(y==1| model) and pr(pr==1| model)[calc w/ expectation of beta] ?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rstan) | |
library(dplyr) | |
options(mc.cores = parallel::detectCores()) | |
rstan_options(auto_write = TRUE) | |
seed_sim = 1234 | |
set.seed(seed_sim) | |
# set up data | |
N = 4000 | |
K = 15 | |
X = matrix(rnorm(N * K), nrow = N, ncol = K) | |
beta_forward = rnorm(n = K, mean = 0, sd = 1) | |
alpha_forward = 1 | |
z = (X %*% beta_forward) %>% as.numeric() + alpha_forward | |
inv_logit = function (s){exp(s)/(1 + exp(s))} | |
pr = inv_logit(z) | |
y = rbinom(N, 1, prob = pr) | |
# quick sanity check on params using glm w/ binomial link | |
df = data.frame(y = y, x1 = X[,1], x2 = X[,2]) | |
glm( y ~ x1 + x2, data = df, family = "binomial") | |
# stan code: student_t for coefficients, centered | |
stan_code = " | |
data { | |
int<lower=0> N; | |
int<lower=0> K; | |
matrix[N,K] X; | |
int<lower=0,upper=1> y[N]; | |
} | |
parameters { | |
real a; | |
real b_shape; | |
real b_location; | |
vector[K] b; | |
} | |
transformed parameters { | |
vector[N] pr_y; | |
pr_y = inv_logit(X * b + a); | |
} | |
model { | |
a ~ normal(1,1); | |
b_location ~ normal(0,1); | |
b_shape ~ gamma(0.1, 0.1); | |
to_vector(b) ~ normal(b_location, b_shape); | |
y ~ binomial(1,pr_y); | |
} | |
" | |
# run STAN w/ a little larger adaptive delta | |
nchains = 1 | |
stanfit = stan(model_code = stan_code, | |
data = list(X = X, y = y, K = K, N = N), | |
control = list(adapt_delta = 0.95, | |
max_treedepth = 15), | |
chains = nchains, | |
seed = seed_sim ) | |
# get model summary | |
mod_summary <- as.data.frame(summary(stanfit)[[1]]) | |
# extract simulated variables | |
b_sim = extract(stanfit, "b")$b | |
a_sim = extract(stanfit, "a")$a | |
pr_y_sim = extract(stanfit, "pr_y")$pr_y | |
# demonstrate that simulated values are equal to mean | |
n_subsample = 241 | |
pr_y_model_mean = mod_summary[grep("pr_y", rownames(mod_summary)), c("mean")] | |
pry = sapply(seq_len(n_subsample), function(n) | |
sapply(seq_len(nchains * 1000), | |
function(i)inv_logit(sum((b_sim[i,] * X[n,])) + a_sim[i] )) %>% mean(., na.rm = TRUE)) | |
(pry - pr_y_model_mean[seq_len(n_subsample)]) %>% abs %>% sum() | |
# AUC sanity check that model is predictive | |
library(ROCR);AUC = performance(prediction(pr_y_model_mean, y),"auc")@y.values[[1]] | |
print(paste("AUC: ", AUC)) | |
## Now for our experiment | |
# grab the mean values, as summarised | |
b_mean = mod_summary[grep("^b\\[", rownames(mod_summary)), c("mean")] | |
a_mean = mod_summary[grep("^a", rownames(mod_summary)), c('mean')] | |
# run a vectorized calculation | |
pr_y_from_b_mean = inv_logit(as.numeric(X %*% b_mean + a_mean)) | |
## some formatting.... | |
pr_y_model_mean_fmt= format(pr_y_model_mean, digits = 4,scientific = FALSE) | |
pr_y_from_b_mean_fmt = format(pr_y_from_b_mean, digits = 4, scientific = FALSE) | |
out_df = data.frame(y = y, | |
prob_stan = pr_y_model_mean_fmt, | |
prob_mean = pr_y_from_b_mean_fmt) | |
if ("1" == Sys.getenv("RSTUDIO")){ | |
View(out_df) | |
} else { | |
out_df %>% head() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment