mike-lawrence · November 30, 2016 22:18
diff --git a/heavy_partially_pooled.stan b/heavy_partially_pooled.stan
 data {
 	#N: number of cases
 	int N ;
 	#: K: number of outcome variables
 	int K ;
 	#X: matrix of outcomes for each case & variable
 	matrix[N,K] X ;
 	#Y: outcomes
 	vector[N] Y;
 }
 parameters {
 	#intercept:
 	real intercept ;
 	#betas: effects of each predictor
 	vector[K] betas ;
 	#sigma: measurement error
 	real<lower=0> sigma ;
 	#tau: estimate of sd of betas
 	real<lower=0> tau ;
 	#nu: log(df) of betas
 	real<lower=0> nu ;
 }
 model {
 	#priors on population parameters; presumes data has been scale()ed
 	intercept ~ normal(0,1) ;
 	betas ~ student_t(exp(nu),0,tau) ;
 	sigma ~ weibull(2,1) ;
 	tau ~ weibull(2,1) ;
 	nu ~ normal(0,1);
 	#outcomes modelled as normal with mean determined by predictors and noise
 	Y ~ normal( intercept + X*betas  , sigma ) ;
 }
diff --git a/multiple_comparisons.R b/multiple_comparisons.R
 #Load packages ----
 library(rstan)
 rstan_options(auto_write = TRUE)


 #Create some fake data ----

 #K: number of predictors
 K = 1e3

 #N: number of outcome instances observed
 N = K*2

 #set seed for replicability
 set.seed(1)

 #X: matrix of predictors
 X = matrix(
 	rnorm(N*K)
 	, nrow = N
 	, ncol = K
 )

 #Y: outcome is unrelated to any predictor
 Y = rnorm(N)

 #package the data for Stan
 data_for_stan = list(
 	N = N
 	, K = K
 	, X = scale(X)
 	, Y = scale(Y)[,1]
 )


 #Try a frequentist regression ----
 fit = lm( Y ~ X)
 CIs = confint(fit)
 sig = (CIs[,1]>0) | (CIs[,2]<0)
 mean(sig) #false alarm rate, 5%-ish


 #compile & sample the unpooled model ----
 unpooled = rstan::stan_model('unpooled.stan')
 post_unpooled = rstan::sampling(
 	object = unpooled
 	, data = data_for_stan
 	, seed = 1
 	, cores = 4
 	, iter = 500 # 500 takes about 5min
 )

 #compute proportion of betas that exclude zero
 betas = data.frame(summary(post_unpooled,pars='betas',probs=c(.025,.975))$summary)
 betas$sig = (betas$X2.5.>0) | (betas$X97.5.<0)
 mean(betas$sig) #5%-ish


 #compile & sample the partially-pooled model ----
 partially_pooled = rstan::stan_model('partially_pooled.stan')
 post_partially_pooled = rstan::sampling(
 	object = partially_pooled
 	, data = data_for_stan
 	, seed = 1
 	, cores = 4
 	, iter = 500 # 2 takes about 5min
 )

 #compute proportion of betas that exclude zero
 betas = data.frame(summary(post_partially_pooled,pars='betas',probs=c(.025,.975))$summary)
 betas$sig = (betas$X2.5.>0) | (betas$X97.5.<0)
 mean(betas$sig) #0%-ish

 #Add a predictor that does relate to the outcome ----

 #set the random seed again for replicability (just in case)
 set.seed(2)

 #Y: outcome, associated with only the first predictor
 Y = X[,1] + rnorm(N)

 #repackage the data
 data_for_stan = list(
 	N = N
 	, K = K
 	, X = scale(X)
 	, Y = scale(Y)[,1]
 )

 #Try the partially-pooled model again ----
 post_partially_pooled = rstan::sampling(
 	object = partially_pooled
 	, data = data_for_stan
 	, seed = 1
 	, cores = 4
 	, iter = 500 # 500 takes about 3min
 )

 #compute proportion of betas that exclude zero
 betas = data.frame(summary(post_partially_pooled,pars='betas',probs=c(.025,.975))$summary)
 betas$sig = (betas$X2.5.>0) | (betas$X97.5.<0)
 mean(betas$sig[2:nrow(betas)]) #2%-ish


 #Compile & sample the heavy-tailed partially-pooled model ----
 heavy_partially_pooled = rstan::stan_model('heavy_partially_pooled.stan')
 post_heavy_partially_pooled = rstan::sampling(
 	object = heavy_partially_pooled
 	, data = data_for_stan
 	, seed = 3 #chain 3 gets stuck with seed=1
 	, cores = 4
 	, iter = 500 # 500 takes about 3min
 )

 #compute proportion of betas that exclude zero
 betas = data.frame(summary(post_heavy_partially_pooled,pars='betas',probs=c(.025,.975))$summary)
 betas$sig = (betas$X2.5.>0) | (betas$X97.5.<0)
 mean(betas$sig[2:nrow(betas)]) #0%
 betas[1,] #check the beta expected to exclude zero

 #check ESS/rhat for tau & nu
 print(
 	post_heavy_partially_pooled
 	, pars = c('tau','nu')
 	, probs = c(.025,.975)
 	, digits = 2
 )
diff --git a/partially_pooled.stan b/partially_pooled.stan
 data {
 	#N: number of cases
 	int N ;
 	#: K: number of outcome variables
 	int K ;
 	#X: matrix of outcomes for each case & variable
 	matrix[N,K] X ;
 	#Y: outcomes
 	vector[N] Y;
 }
 parameters {
 	#intercept:
 	real intercept ;
 	#betas: effects of each predictor
 	vector[K] betas ;
 	#sigma: measurement error
 	real<lower=0> sigma ;
 	#tau: estimate of sd of betas
 	real<lower=0> tau ;
 }
 model {
 	#priors on population parameters; presumes data has been scale()ed
 	intercept ~ normal(0,1) ;
 	betas ~ normal(0,tau) ;
 	sigma ~ weibull(2,1) ;
 	tau ~ weibull(2,1) ;
 	#outcomes modelled as normal with mean determined by predictors and noise
 	Y ~ normal( intercept + X*betas  , sigma ) ;
 }
diff --git a/unpooled.stan b/unpooled.stan
 data {
 	#N: number of cases
 	int N ;
 	#: K: number of outcome variables
 	int K ;
 	#X: matrix of outcomes for each case & variable
 	matrix[N,K] X ;
 	#Y: outcomes
 	vector[N] Y;
 }
 parameters {
 	#intercept:
 	real intercept ;
 	#betas: effects of each predictor
 	vector[K] betas ;
 	#sigma: measurement error
 	real<lower=0> sigma ;
 }
 model {
 	#priors on population parameters; presumes data has been scale()ed
 	intercept ~ normal(0,1) ;
 	betas ~ normal(0,1) ;
 	sigma ~ weibull(2,1) ;
 	#outcomes modelled as normal with mean determined by predictors and noise
 	Y ~ normal( intercept + X*betas  , sigma ) ;
 }
	data {
	#N: number of cases
	int N ;
	#: K: number of outcome variables
	int K ;
	#X: matrix of outcomes for each case & variable
	matrix[N,K] X ;
	#Y: outcomes
	vector[N] Y;
	}
	parameters {
	#intercept:
	real intercept ;
	#betas: effects of each predictor
	vector[K] betas ;
	#sigma: measurement error
	real<lower=0> sigma ;
	#tau: estimate of sd of betas
	real<lower=0> tau ;
	#nu: log(df) of betas
	real<lower=0> nu ;
	}
	model {
	#priors on population parameters; presumes data has been scale()ed
	intercept ~ normal(0,1) ;
	betas ~ student_t(exp(nu),0,tau) ;
	sigma ~ weibull(2,1) ;
	tau ~ weibull(2,1) ;
	nu ~ normal(0,1);
	#outcomes modelled as normal with mean determined by predictors and noise
	Y ~ normal( intercept + X*betas , sigma ) ;
	}
	#Load packages ----
	library(rstan)
	rstan_options(auto_write = TRUE)


	#Create some fake data ----

	#K: number of predictors
	K = 1e3

	#N: number of outcome instances observed
	N = K*2

	#set seed for replicability
	set.seed(1)

	#X: matrix of predictors
	X = matrix(
	rnorm(N*K)
	, nrow = N
	, ncol = K
	)

	#Y: outcome is unrelated to any predictor
	Y = rnorm(N)

	#package the data for Stan
	data_for_stan = list(
	N = N
	, K = K
	, X = scale(X)
	, Y = scale(Y)[,1]
	)


	#Try a frequentist regression ----
	fit = lm( Y ~ X)
	CIs = confint(fit)
	sig = (CIs[,1]>0) \| (CIs[,2]<0)
	mean(sig) #false alarm rate, 5%-ish


	#compile & sample the unpooled model ----
	unpooled = rstan::stan_model('unpooled.stan')
	post_unpooled = rstan::sampling(
	object = unpooled
	, data = data_for_stan
	, seed = 1
	, cores = 4
	, iter = 500 # 500 takes about 5min
	)

	#compute proportion of betas that exclude zero
	betas = data.frame(summary(post_unpooled,pars='betas',probs=c(.025,.975))$summary)
	betas$sig = (betas$X2.5.>0) \| (betas$X97.5.<0)
	mean(betas$sig) #5%-ish


	#compile & sample the partially-pooled model ----
	partially_pooled = rstan::stan_model('partially_pooled.stan')
	post_partially_pooled = rstan::sampling(
	object = partially_pooled
	, data = data_for_stan
	, seed = 1
	, cores = 4
	, iter = 500 # 2 takes about 5min
	)

	#compute proportion of betas that exclude zero
	betas = data.frame(summary(post_partially_pooled,pars='betas',probs=c(.025,.975))$summary)
	betas$sig = (betas$X2.5.>0) \| (betas$X97.5.<0)
	mean(betas$sig) #0%-ish

	#Add a predictor that does relate to the outcome ----

	#set the random seed again for replicability (just in case)
	set.seed(2)

	#Y: outcome, associated with only the first predictor
	Y = X[,1] + rnorm(N)

	#repackage the data
	data_for_stan = list(
	N = N
	, K = K
	, X = scale(X)
	, Y = scale(Y)[,1]
	)

	#Try the partially-pooled model again ----
	post_partially_pooled = rstan::sampling(
	object = partially_pooled
	, data = data_for_stan
	, seed = 1
	, cores = 4
	, iter = 500 # 500 takes about 3min
	)

	#compute proportion of betas that exclude zero
	betas = data.frame(summary(post_partially_pooled,pars='betas',probs=c(.025,.975))$summary)
	betas$sig = (betas$X2.5.>0) \| (betas$X97.5.<0)
	mean(betas$sig[2:nrow(betas)]) #2%-ish


	#Compile & sample the heavy-tailed partially-pooled model ----
	heavy_partially_pooled = rstan::stan_model('heavy_partially_pooled.stan')
	post_heavy_partially_pooled = rstan::sampling(
	object = heavy_partially_pooled
	, data = data_for_stan
	, seed = 3 #chain 3 gets stuck with seed=1
	, cores = 4
	, iter = 500 # 500 takes about 3min
	)

	#compute proportion of betas that exclude zero
	betas = data.frame(summary(post_heavy_partially_pooled,pars='betas',probs=c(.025,.975))$summary)
	betas$sig = (betas$X2.5.>0) \| (betas$X97.5.<0)
	mean(betas$sig[2:nrow(betas)]) #0%
	betas[1,] #check the beta expected to exclude zero

	#check ESS/rhat for tau & nu
	print(
	post_heavy_partially_pooled
	, pars = c('tau','nu')
	, probs = c(.025,.975)
	, digits = 2
	)