dmarx · August 23, 2018 21:34
diff --git a/anova_power_analysis.R b/anova_power_analysis.R
 library(tidyr) # just for gather()

 simulate <- function(formulas
        ,obs_per_cell = 100
        ,n_cells_per_group = 3
        ,amp_mu = 50
        ,amp_sd = 1 # this is only thing that impacts sd
        ,true_effect = 3
        ,error_sd = 1
        ){
    cell_mu = rnorm(2*n_cells_per_group, amp_mu, amp_sd)
    effect = rnorm(n_cells_per_group, true_effect, error_sd)
    cell_mu[1:n_cells_per_group] = cell_mu[1:n_cells_per_group] + effect

    X = sapply(cell_mu, function(mu) rnorm(obs_per_cell, mu))
    X = gather(data.frame(X))
    colnames(X) = c('cell_id', 'amp')

    X$healthy = 0
    X[X$cell_id %in% paste0('X',1:n_cells_per_group), 'healthy'] = 1
        
    measured_effect = rep(0, length(formulas))
    effect_significance = rep(0, length(formulas))
    for (i in 1:length(formulas)){
        mod = lm(formulas[i], X)
        measured_effect[i] = summary(mod)$coefficients['healthy', 'Estimate']
        effect_significance[i] = summary(mod)$coefficients['healthy', 'Pr(>|t|)']
    }
    list(measured_effect=measured_effect, effect_significance=effect_significance)
 }

 ##################################################################################
               
 formulas = c('amp ~ healthy + cell_id', 
             'amp ~ healthy')
 n_reps = 2e3
 v = replicate(n_reps, simulate(formulas, amp_sd=1, error_sd=10))

 ##################################################################################

 effect = t(matrix(unlist(v['measured_effect',]),nrow=2))#[1:3,]
 apply(effect, 2, mean)
 apply(effect, 2, sd)

 pvals = t(matrix(unlist(v['effect_significance',]),nrow=2))#[1:3,]
 apply(pvals, 2, mean)
 apply(pvals, 2, sd)
 apply(pvals, 2, function(p) mean(p<1/n_reps) )

 # Both models find correct mean, simpler model's estimates have lower variance (generally about half that of more complex model). 
 # When amp_sd or error_sd are large, more complex model is more likely to return significant results. 
 # My interpretation: more complex model is less accurate and overconfident. 
 # This is likely symptomatic of the cell_id coefficients adding unnecessary degrees of freedom.
	library(tidyr) # just for gather()

	simulate <- function(formulas
	,obs_per_cell = 100
	,n_cells_per_group = 3
	,amp_mu = 50
	,amp_sd = 1 # this is only thing that impacts sd
	,true_effect = 3
	,error_sd = 1
	){
	cell_mu = rnorm(2*n_cells_per_group, amp_mu, amp_sd)
	effect = rnorm(n_cells_per_group, true_effect, error_sd)
	cell_mu[1:n_cells_per_group] = cell_mu[1:n_cells_per_group] + effect

	X = sapply(cell_mu, function(mu) rnorm(obs_per_cell, mu))
	X = gather(data.frame(X))
	colnames(X) = c('cell_id', 'amp')

	X$healthy = 0
	X[X$cell_id %in% paste0('X',1:n_cells_per_group), 'healthy'] = 1

	measured_effect = rep(0, length(formulas))
	effect_significance = rep(0, length(formulas))
	for (i in 1:length(formulas)){
	mod = lm(formulas[i], X)
	measured_effect[i] = summary(mod)$coefficients['healthy', 'Estimate']
	effect_significance[i] = summary(mod)$coefficients['healthy', 'Pr(>\|t\|)']
	}
	list(measured_effect=measured_effect, effect_significance=effect_significance)
	}

	##################################################################################

	formulas = c('amp ~ healthy + cell_id',
	'amp ~ healthy')
	n_reps = 2e3
	v = replicate(n_reps, simulate(formulas, amp_sd=1, error_sd=10))

	##################################################################################

	effect = t(matrix(unlist(v['measured_effect',]),nrow=2))#[1:3,]
	apply(effect, 2, mean)
	apply(effect, 2, sd)

	pvals = t(matrix(unlist(v['effect_significance',]),nrow=2))#[1:3,]
	apply(pvals, 2, mean)
	apply(pvals, 2, sd)
	apply(pvals, 2, function(p) mean(p<1/n_reps) )

	# Both models find correct mean, simpler model's estimates have lower variance (generally about half that of more complex model).
	# When amp_sd or error_sd are large, more complex model is more likely to return significant results.
	# My interpretation: more complex model is less accurate and overconfident.
	# This is likely symptomatic of the cell_id coefficients adding unnecessary degrees of freedom.
No results found