Created
May 19, 2011 17:35
-
-
Save jebyrnes/981292 to your computer and use it in GitHub Desktop.
Simulations of beta regression as compared to other techniques designed for working on response variables on a unit (0,1) scale.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###### | |
#script to generate simulations comparing the power and type I error rate | |
#of a variety of different methods of analyzing response variables on a unit (0-1) scale | |
# | |
# Inspired by "The Arcsine is Asinine" in a 2011 issue of Ecology | |
# | |
# [email protected] | |
# last updated 5/19/11 | |
# | |
# Changelog: Added visualization code | |
# 5/19/11 Fixed species factor error | |
# 5/19/11 Added censored regression | |
###### | |
library(plyr) | |
library(betareg) | |
library(lmtest) | |
library(car) #for logit transform | |
library(multicore) | |
#source("./betareg.fit2.R") #I whipped up something to use nlminb instead of optim. It was faster, but let's go with optim for now | |
#the function to generate power and error for different methods of analysis | |
#the basic ideas is that, say you have % of something eaten by different species. | |
#we're assuming a basic linear model here, but, you can't have negative amounts eaten or >100% eaten - that's just 0 or 100. | |
sim3 <- function(n.sim=100, sp.slope=c(10,20,30,40), n=10, p.crit=0.05, dens.range=0:10, stdDev=30, mc.cores=1){ | |
dataset<-data.frame(expand.grid(rep(dens.range, n), 1:length(sp.slope))) | |
names(dataset)<-c("density", "species") | |
#get a p value from either an F or Chisq LR test | |
getp<-function(mod1, mod2, type="F"){ | |
if(type=="F") return(anova(mod1, mod2)$"Pr(>F)"[2]) | |
if(type=="ch") return(lrtest(mod1, mod2)$"Pr(>Chisq)"[2]) | |
return(NA) | |
} | |
#create a list with all of the p value tests | |
# cat("goin' in!\n") #debug | |
#use mclapply to run the simulations, this leveraging multi-core computers | |
eval_list<-mclapply(1:n.sim, function(x) { | |
# cat(paste(x, n, "\n", sep=" ")) #so we know things are going... | |
#simulate a response variable for which there is an interaction | |
dataset$y1<-rnorm(n*length(sp.slope)*length(dens.range), dataset$density*sp.slope[dataset$species], stdDev) | |
dataset$y1[which(dataset$y1>100)]<-99.9 #correction to fit in scale | |
dataset$y1[which(dataset$y1<0)]<-0.001 #correction to fit in scale | |
#note, there are other options for correcting the 0 and 100 values, but, I'm using a simple one here | |
#simulate a response variable for which there isn't even a species effect | |
dataset$y2<-rnorm(n*length(sp.slope)*length(dens.range), dataset$density*mean(sp.slope), stdDev) | |
dataset$y2[which(dataset$y2>100)]<-99.9 | |
dataset$y2[which(dataset$y2<0)]<-0.001 | |
dataset$species<-as.factor(dataset$species) | |
return(c( "untrans_power" = getp(lm(y1 ~ density*species, data=dataset), lm(y1 ~ density+species, data=dataset)), | |
"untrans_error" = getp(lm(y2 ~ density*species, data=dataset), lm(y2 ~ density+species, data=dataset)), | |
"arcsin_power" = getp(lm(asin(sqrt(y1/100)) ~ density*species, data=dataset), lm(asin(sqrt(y1/100)) ~ density+species, data=dataset)), | |
"arcsin_error" = getp(lm(asin(sqrt(y2/100)) ~ density*species, data=dataset), lm(asin(sqrt(y2/100)) ~ density+species, data=dataset)), | |
"logit_power" = getp(lm(logit(y1) ~ density*species, data=dataset), lm(logit(y1) ~ density+species, data=dataset)), | |
"logit_error" = getp(lm(logit(y2) ~ density*species, data=dataset), lm(logit(y2) ~ density+species, data=dataset)), | |
"censReg_power" = getp(censReg(y1~ density*species, data=dataset, left=0.001, right=99.9), censReg(y1 ~ density+species, data=dataset, left=0.001, right=99.9), type="ch"), | |
"censReg_error" = getp(censReg(y2 ~ density*species, data=dataset, left=0.001, right=99.9), censReg(y2 ~ density+species, data=dataset, left=0.001, right=99.9), type="ch"), | |
#"glm_power" = getp(glm(y1/100 ~ density*species, data=dataset, family=binomial), glm(y1/100 ~ density+species, data=dataset, family=binomial), type="ch"), | |
#"glm_error" = getp(glm(y2/100 ~ density*species, data=dataset, family=binomial), glm(y2/100 ~ density+species, data=dataset, family=binomial), type="ch"), | |
"betareg_power" = getp(betareg(I(y1/100)~ density*species, data=dataset), betareg(I(y1/100) ~ density+species, data=dataset), type="ch"), | |
"betareg_error" = getp(betareg(I(y2/100) ~ density*species, data=dataset), betareg(I(y2/100) ~ density+species, data=dataset), type="ch") | |
)) | |
}, mc.cores= mc.cores) | |
eval_df<-data.frame(t(eval_list[[1]])) | |
for(i in 2:length(eval_list)) {eval_df<-rbind(eval_df, eval_list[[i]])} | |
ret<-apply(eval_df, 2, function(x) sum((x<=p.crit), na.rm=T)/(n.sim-length(which(is.na(x))))) | |
#colSums(eval_df<=p.crit, na.rm=T)/n.sim | |
return(ret) | |
} | |
#note, this is setup for EOS @ nceas, so...yeah, 10 cores. | |
simDf<-ddply(data.frame(n=c(3,6,9,12,14,16,18,21,24,27)), "n", .progress="text", function(x) sim3(n.sim=1000, n=x$n, mc.cores=10, stdDev=40)) | |
#write.csv(simDf[1:10,], "beta_pow.csv", row.names=F) #plyr kept giving me repeated rows, so, I only need the first 10 | |
####graphing results | |
#simDf<-read.csv("./beta_pow.csv") | |
errDf<-melt(simDf, id.vars="n", measure.vars=c(3,5,7,11)) | |
errDf$variable<-gsub("_error", "", errDf$variable) | |
qplot(n, value, data=errDf, colour=variable, geom=c("point", "line"), ylab="Type I Error Rate\n", xlab="\nSample Size")+theme_bw(base_size=18)+scale_colour_hue(name="Analysis") | |
powDf<-melt(simDf, id.vars="n", measure.vars=c(2,4,6,10)) | |
powDf$variable<-gsub("_power", "", powDf$variable) | |
qplot(n, value, data=powDf, colour=variable, geom=c("point", "line"), ylab="Power\n", xlab="\nSample Size")+theme_bw()+theme_bw(base_size=18)+scale_colour_hue(name="Analysis") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment