aaronsaunders · August 8, 2013 10:47 · ejanalysis · Jul 24, 2014
diff --git a/aggregate.R b/aggregate.R
 #
 # Steve Pittard - [email protected], 03/19/12
 # Code to illustrate motivations for using apply function
 #
 # See www.bimcore.emory.edu/bbseries for slides and code downloads
 #
 # References include:
 # http://statland.org/R/R/Rpulse2.htm , http://www.cyclismo.org/tutorial/R/tables.html#manipulations
 # http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/
 # 

 data(mtcars)
 ?mtcars

 by(mtcars$mpg,    list(am=mtcars$am, cyl=mtcars$cyl), mean)
 by(mtcars[, c('hp','wt')], list(am=mtcars$am), mean)
 by(mtcars['mpg'], list(am=mtcars$am, cyl=mtcars$cyl), mean)

 tapply(mtcars$mpg, mtcars$am,mean)
 tapply(mtcars$mpg, list(mtcars$am,mtcars$cyl), mean)

 ###
 # Let's talk about aggregation now that we know something about the apply functions
 ###

 # Simple tables
 letters[1:10]                   # Built in letters char vector
 my.sample <- sample(letters[1:10], 20, replace=TRUE)   # sample 20 times from the 1st 10 letters with replacement
 table(my.sample)   # Counts frequency of letters

 # The table is much simpler than writing a for loop
 # We will now use the internal mtcars data set - assumes some factors are present
 my.mtcars <- transform(mtcars, am=factor(am, labels=c("auto","manual")))
 my.table  <- table(my.mtcars$cyl,my.mtcars$am)

 # We can even add in another group - Let's try number of gears
 my.3d.table <- table(my.mtcars$cyl, my.mtcars$am, my.mtcars$gear)
 my.3d.table

 # Here we create some categories on the fly using cut
 with(my.mtcars, table(cut(mpg, quantile(mpg)), cyl))
 with(my.mtcars, table(cut(mpg, quantile(mpg), labels=c("Big Gas Hog","Gas Hog","Not Bad","The Best")), 
                          cyl))

 # The xtabs function also summarizes things. It uses a formula notation
 xtabs(~SNP1 + SNP2, new.snps)
 xtabs(~am + cyl, my.mtcars)
 xtabs(~am + cyl + gear, my.mtcars)

 class(my.table)
 my.table

 # Tables are easily visualized
 barplot(my.table, legend=TRUE, col=c("red","blue","green"), ylim=c(0,20))
 plot(my.table, col=c("red","blue","green"), main="Mosaic Plot")

 # You can see the margin totals
 addmargins(my.table)
 addmargins(my.table, 1)
 addmargins(my.table, 2)

 # Working with table proportions
 addmargins(prop.table(my.table))
 addmargins(prop.table(my.table, 2))  # Look at proportions columnwise
 addmargins(prop.table(my.table, 1))  # Look at proportions row wise

 ##
 # Let's explore the aggregate function
 ##
 # The aggregate function in this case is "mean" but could be something else

 aggregate(mtcars['mpg'], list(Transmission=mtcars$am),mean)
 aggregate(mtcars[c('mpg','hp')], list(Transmission_Type=mtcars$am), mean)
 aggregate(mtcars[c('mpg','hp')], list(Transmission_Type=mtcars$am,  
                                             Cylinders=mtcars$cyl), mean)

 aggregate(mpg ~ am, my.mtcars, mean)

 # Do the mean aggregation of MPG by Transmission and cylinder type
 aggregate(mpg~am+cyl, my.mtcars, mean)

 # Do the mean aggregation of MPG & weight by transmission and cylinder type
 aggregate(cbind(mpg,wt,hp) ~ am+cyl, my.mtcars, mean) 

 # Aggregate has an alternative calling sequence
 aggregate(my.mtcars[c('mpg','wt','hp')], 
          by=list(am=my.mtcars$am, cyl=my.mtcars$cyl), mean)

 # The Reshape package

 library(reshape)
 my.melt <- melt(my.mtcars, id.vars=c("cyl","am"))
 my.melt
 cast(my.melt, cyl ~ variable, mean)
 cast(my.melt, cyl ~ variable| am, mean,subset=variable %in% c("hp","wt"))

 # Let's work with some SNP data
 cast(melt(new.snps, id.vars=c("SNP1","SNP2","co")), SNP1+SNP2+co ~ variable,mean)

 aggregate(new.snps[c('X1','X2')], 
          by=list(SNP1=new.snps$SNP1, SNP2=new.snps$SNP2, CO=new.snps$co), mean)

 # But we can provide our own function as an argument. 
 # In this case the mean and standard deviation
 myFunction <- function(x) {mx=mean(x);sdx=sd(x);mz=c(mean=mx,sd=sdx)}
 aggregate(new.snps[c('X1','X2')], by=list(SNP1=my.snps$SNP1,SNP2=my.snps$SNP2), myFunction )

 # What would the reshape solution look like ?
 cast(melt(new.snps,id.vars=c("SNP1","SNP2","co")),SNP1 +SNP2 ~ variable, myFunction)
	#
	# Steve Pittard - [email protected], 03/19/12
	# Code to illustrate motivations for using apply function
	#
	# See www.bimcore.emory.edu/bbseries for slides and code downloads
	#
	# References include:
	# http://statland.org/R/R/Rpulse2.htm , http://www.cyclismo.org/tutorial/R/tables.html#manipulations
	# http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/
	#

	data(mtcars)
	?mtcars

	by(mtcars$mpg, list(am=mtcars$am, cyl=mtcars$cyl), mean)
	by(mtcars[, c('hp','wt')], list(am=mtcars$am), mean)
	by(mtcars['mpg'], list(am=mtcars$am, cyl=mtcars$cyl), mean)

	tapply(mtcars$mpg, mtcars$am,mean)
	tapply(mtcars$mpg, list(mtcars$am,mtcars$cyl), mean)

	###
	# Let's talk about aggregation now that we know something about the apply functions
	###

	# Simple tables
	letters[1:10] # Built in letters char vector
	my.sample <- sample(letters[1:10], 20, replace=TRUE) # sample 20 times from the 1st 10 letters with replacement
	table(my.sample) # Counts frequency of letters

	# The table is much simpler than writing a for loop
	# We will now use the internal mtcars data set - assumes some factors are present
	my.mtcars <- transform(mtcars, am=factor(am, labels=c("auto","manual")))
	my.table <- table(my.mtcars$cyl,my.mtcars$am)

	# We can even add in another group - Let's try number of gears
	my.3d.table <- table(my.mtcars$cyl, my.mtcars$am, my.mtcars$gear)
	my.3d.table

	# Here we create some categories on the fly using cut
	with(my.mtcars, table(cut(mpg, quantile(mpg)), cyl))
	with(my.mtcars, table(cut(mpg, quantile(mpg), labels=c("Big Gas Hog","Gas Hog","Not Bad","The Best")),
	cyl))

	# The xtabs function also summarizes things. It uses a formula notation
	xtabs(~SNP1 + SNP2, new.snps)
	xtabs(~am + cyl, my.mtcars)
	xtabs(~am + cyl + gear, my.mtcars)

	class(my.table)
	my.table

	# Tables are easily visualized
	barplot(my.table, legend=TRUE, col=c("red","blue","green"), ylim=c(0,20))
	plot(my.table, col=c("red","blue","green"), main="Mosaic Plot")

	# You can see the margin totals
	addmargins(my.table)
	addmargins(my.table, 1)
	addmargins(my.table, 2)

	# Working with table proportions
	addmargins(prop.table(my.table))
	addmargins(prop.table(my.table, 2)) # Look at proportions columnwise
	addmargins(prop.table(my.table, 1)) # Look at proportions row wise

	##
	# Let's explore the aggregate function
	##
	# The aggregate function in this case is "mean" but could be something else

	aggregate(mtcars['mpg'], list(Transmission=mtcars$am),mean)
	aggregate(mtcars[c('mpg','hp')], list(Transmission_Type=mtcars$am), mean)
	aggregate(mtcars[c('mpg','hp')], list(Transmission_Type=mtcars$am,
	Cylinders=mtcars$cyl), mean)

	aggregate(mpg ~ am, my.mtcars, mean)

	# Do the mean aggregation of MPG by Transmission and cylinder type
	aggregate(mpg~am+cyl, my.mtcars, mean)

	# Do the mean aggregation of MPG & weight by transmission and cylinder type
	aggregate(cbind(mpg,wt,hp) ~ am+cyl, my.mtcars, mean)

	# Aggregate has an alternative calling sequence
	aggregate(my.mtcars[c('mpg','wt','hp')],
	by=list(am=my.mtcars$am, cyl=my.mtcars$cyl), mean)

	# The Reshape package

	library(reshape)
	my.melt <- melt(my.mtcars, id.vars=c("cyl","am"))
	my.melt
	cast(my.melt, cyl ~ variable, mean)
	cast(my.melt, cyl ~ variable\| am, mean,subset=variable %in% c("hp","wt"))

	# Let's work with some SNP data
	cast(melt(new.snps, id.vars=c("SNP1","SNP2","co")), SNP1+SNP2+co ~ variable,mean)

	aggregate(new.snps[c('X1','X2')],
	by=list(SNP1=new.snps$SNP1, SNP2=new.snps$SNP2, CO=new.snps$co), mean)

	# But we can provide our own function as an argument.
	# In this case the mean and standard deviation
	myFunction <- function(x) {mx=mean(x);sdx=sd(x);mz=c(mean=mx,sd=sdx)}
	aggregate(new.snps[c('X1','X2')], by=list(SNP1=my.snps$SNP1,SNP2=my.snps$SNP2), myFunction )

	# What would the reshape solution look like ?
	cast(melt(new.snps,id.vars=c("SNP1","SNP2","co")),SNP1 +SNP2 ~ variable, myFunction)