Created
November 16, 2011 01:34
-
-
Save aammd/1368990 to your computer and use it in GitHub Desktop.
howtokillaforloop
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# by Andrew MacDonald Nov 2011 | |
## please add your own tips and tricks! | |
## the idea here is to demonstrate how for loops are used | |
## and then immediately demonstrate how they can be avoided. | |
## we'll begin by creating a fake dataset. | |
## This experiment is a 3x3 factorial with 2 replicates of each treatment combination. | |
## The scientist applied nitrogen and phosphorous to the soil | |
## then she measured two responses: plant growth and seed production. | |
## both responses increase with nutrient addition. There's no interaction. | |
#################################################################################################### | |
###################### simulating the dataset ###################################################### | |
#################################################################################################### | |
## a vector of treatment levels | |
treatments <- c("low","med","high") | |
## generate a dataframe with two factors. | |
## rep makes copies of the 'treatment' vector. for the factor 'nitrogen', the whole vector 'treatments' is repeated six times | |
## while for phosp, each element of 'treatments' is repeated six times. the result is two fully-crossed factors, with every combination repeated twice (ie n=2 for each treatment combination) | |
enrich <- data.frame(nitrogen=factor(rep(treatments,6),levels=c("low","med","high")), | |
phosp=factor(rep(treatments,rep(6,3)),levels=c("low","med","high")) | |
) | |
## to generate response variables, we just make up an equation that describes how the mean of a distribution | |
## depends on the values of our treatments (ie the numerical vectors in the two lines below are just my imagination) | |
## R has the ability to generate random numbers from all the common statistical distributions. | |
## to generate data with a plausible amount of noise, fill in these equations for the value of the means | |
enrich$growth<-with(enrich,rnorm(n=dim(enrich)[1],mean=c(2,3,4)[nitrogen]+c(3,5,7)[phosp])) | |
## while the normal distribution might describe growth (continuous), the poisson will make plausible seed counts (discrete) | |
enrich$seeds <- with(enrich,rpois(n=dim(enrich)[1],lambda=c(15,20,25)[nitrogen]+c(20,30,40)[phosp])) | |
#################################################################################################### | |
########################## operations on factor levels ############################################# | |
#################################################################################################### | |
## QUESTION: what is the mean growth of plants in the different phosphorus treatments? | |
## with a (totally needless!) for-loop | |
x<-numeric(0) | |
for (i in levels(enrich$phosp)) { | |
x[i]<-mean(enrich$growth[which(enrich$phosp==i)]) | |
} | |
x | |
## TAPPLY is useful: it finds all the values of a response variable that match the levels of a factor | |
## then it runs a function of your choice on those values. | |
## the function can be a standard on (mean, sd) or one you make up. | |
with(enrich,tapply(growth,phosp,mean)) | |
with(enrich,tapply(seeds,phosp,mean)) | |
## tapply can do more than one factor at once: | |
with(enrich,tapply(growth,list(phosp,nitrogen),mean)) | |
## confirm there are two replicates in each group | |
with(enrich,tapply(growth,list(phosp,nitrogen),length)) | |
## AGGREGATE is an extremely powerful function. | |
## Like tapply it can work with as many factors as you want, and can run any function you like | |
## Unlike tapply, it can handle multiple response variables and produces a dataframe, not a matrix. | |
## take the average of all replicates in the same treatment combination: | |
with(enrich,aggregate(enrich[,3:4],by=list(p=phosp,n=nitrogen),mean)) | |
#################################################################################################### | |
############################# operations on columns and rows ####################################### | |
#################################################################################################### | |
## QUESTION: what is the total growth in each treatment level? | |
## lets begin with the result of tapply as above | |
growth.resp <- with(enrich,tapply(growth,list(phosp,nitrogen),mean)) | |
## for-loop | |
x<-numeric(0) | |
for (i in 1:dim(growth.resp)[2]){ | |
x[i] <- sum(growth.resp[,i]) | |
} | |
x | |
## The above can be done in a single function: | |
apply(growth.resp,2,sum) | |
## colSums is much faster, in addition to being easier to read! | |
colSums(growth.resp) | |
## all three of the techniques above could be done on rows as well. I leave this as an exercise for the reader (sorry to be so smug; I've just always wanted to say that!) | |
#################################################################################################### | |
###################### operations on lists ######################################################### | |
#################################################################################################### | |
## you can begin to save a great deal of time by using lists. | |
## for example, you can collect similar objects in one list and then apply a function to each with a single line of code. | |
# plot each response variable as a function of a fertilizer treatment | |
par(mfrow=c(1,2)) | |
for (i in 3:4){ | |
with(enrich,plot(x=phosp,y=enrich[,i],main=names(enrich)[i],xlab="phosp")) | |
} | |
## lapply becomes very flexible when you write your own special function for it to run. | |
## this draws histograms of each variable | |
par(mfrow=c(1,2)) | |
lapply(enrich[,3:4],hist,col='green') #notice how additional arguments can be applied. | |
## this reproduces our for-loop from above... | |
lapply(enrich[,3:4],function(resp) plot(x=enrich$phosp,y=resp)) | |
## ... and the next line does even better! Here we are running lapply on the *names*, not the actual data. | |
## this is nicer because it lets us use the name itself, for example to label the plots with the name of the response variable. | |
## remember: a dataframe is a kind of list! | |
boxplots <- lapply(names(enrich)[3:4],function(resp) plot(cbind(enrich["phosp"],enrich[resp]),main=resp)) | |
## the function could become as complex as you like; in which case you should probably define it as a function | |
## outside of lapply - i.e., give it a nice name. | |
## Question CLOSED (THANKS LAURA!!) these are equivalent ways of making it work. | |
lapply(names(enrich)[3:4],function(resp) plot(x=enrich["phosp"],y=enrich[[resp]],main=resp)) | |
lapply(names(enrich)[3:4],function(resp) plot(enrich[,c("phosp",resp)],main=resp)) | |
## notice that "plot" actually produces some information (in this case about the breakpoints used to make the boxplots etc) | |
## by saving this information to a list, we can save it for easy extraction later: | |
sapply(boxplots,function(x) x["conf"]) | |
## lapply and sapply are the same, except that sapply "S"implifies the resulting list into a vector or array | |
## sometimes this makes no sense; other times it is convenient: | |
lapply(enrich[,3:4],mean) #returns a list | |
sapply(enrich[,3:4],mean) # a vector |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment