Skip to content

Instantly share code, notes, and snippets.

@aaronsaunders
Created August 9, 2013 07:06
Show Gist options
  • Save aaronsaunders/6191702 to your computer and use it in GitHub Desktop.
Save aaronsaunders/6191702 to your computer and use it in GitHub Desktop.
A Survey of Data Aggregation Techniques in R
#
# Steve Pittard - [email protected], 03/19/12
# Code to illustrate motivations for using apply function
#
# See www.bimcore.emory.edu/bbseries for slides and code downloads
#
# References include:
# http://statland.org/R/R/Rpulse2.htm , http://www.cyclismo.org/tutorial/R/tables.html#manipulations
# http://nsaunders.wordpress.com/2010/08/20/a-brief-introduction-to-apply-in-r/
#
#my.snps <- read.csv("/Users/fender/RSPH.DIR/CLASS.DIR/nsnps.csv",sep=" ",header=F)
my.snps <- read.table("http://www.bimcore.emory.edu/nsnps.csv",header=F,sep=" ")
names(my.snps) <- c("X1", "X2", "SNP1", "SNP2")
new.snps <- cbind(my.snps, co=sample(c("case","control"), 30, TRUE))
# Begin Initializations - Let's generate some matrices for example use.
# To do this I actually use some of the functions that I'm trying to
# explain in the video. So, at first, just ignore this first section.
new.snps <- read.csv("new_snps.csv", sep=",", header=TRUE)
test = list()
length(test) = 6 # number of data frame columns
test = sapply(test,function(x) rnorm(10,mean=2.5,sd=1))
mat.1K = replicate(6,rnorm(1000,mean=5,sd=1)) # 1,000 rows by 6 cols
colnames(mat.1K)=paste(rep("B",6),1:6,sep="")
mat.10K = replicate(6,rnorm(10000,mean=5,sd=8)) # 10,000 rows by 6 cols
# End initializations
##
# Given a matrix like those above (e.g. mat.1K)
# The goal is to compute relative percentages down each column (across all rows)
# Each element is divided by the sum of the column times 100
#
# Here is one way that a newcomer to R might approach this problem. Its okay if
# you do this because this would be commonly done in other languages.
##
hold.df = mat.1K
rows = nrow(mat.1K)
for (jj in 1:nrow(hold.df)) {
for (kk in 1:ncol(hold.df)) {
hold.df[jj,kk] = mat.1K[jj,kk]/sum(mat.1K[1:rows,kk])*100
}
}
# Verify that it worked - all columns should add up to 100
for (ii in 1:ncol(hold.df)) {
print(sum(hold.df[,ii]))
}
# And we might put this into a function definition for later reuse
my.first.percent.func = function(temp.mat) {
hold.mat = temp.mat
rows = nrow(temp.mat)
for (jj in 1:nrow(hold.mat)) {
for (kk in 1:ncol(hold.mat)) {
hold.mat[jj,kk] = temp.mat[jj,kk]/sum(temp.mat[1:rows,kk])*100
}
}
return(hold.mat)
}
# Convince yourself that it worked by printing out the sum of the columns
# They should all be equal to 100
hold.df = my.first.percent.func(mat.1K)
for (ii in 1:ncol(hold.df)) {
print(sum(hold.df[,ii]))
}
# We can eliminate the outer for-loop if we know something about bracket notation
# when referencing matrices in R. It now looks somewhat cleaner.
hold.df = mat.1K
for (ii in 1:ncol(hold.df)) {
hold.df[,ii] = hold.df[,ii]/sum(hold.df[,ii])*100
}
# Let's update the previous function
my.second.percent.func = function(temp.df) {
for (ii in 1:ncol(temp.df)) {
temp.df[,ii] = temp.df[,ii]/sum(temp.df[,ii])*100
}
return(temp.df)
}
# all.equal compares the values of its arguments to see if they are in fact equal
all.equal(my.first.percent.func(mat.1K),my.second.percent.func(mat.1K))
# Let's see if this change made the timing any better... It should !
system.time(my.first.percent.func(mat.1K)) # has 2 for loops
system.time(my.second.percent.func(mat.1K)) # has 1 for loop
# But what do you do if you want to compute percents for each row across all the columns ?
# We could transpose the matrix.
row.perc = t(my.second.percent.func(t(mat.1K)))
# But if we don't want to do this then we need to recode the logic.
# Let's add an argument called "rc" that lets us provide a 1 or 2 to correspond to "row" or "column" respectively
#
my.third.percent.func = function(temp.df,rc) {
if (rc == 1) { # rc of 1 means rows
for (ii in 1:nrow(temp.df)) {
temp.df[ii,] = temp.df[ii,]/sum(temp.df[ii,])*100
}
} else { # We lazily assume that if it isn't 1 then it must be 2
for (ii in 1:ncol(temp.df)) {
temp.df[,ii] = temp.df[,ii]/sum(temp.df[,ii])*100
}
}
return(temp.df)
}
all.equal(my.first.percent.func(mat.1K),my.second.percent.func(mat.1K),my.third.percent.func(mat.1K,2))
# Nice work. But what if we wanted to change what it is we are computing ? We would have to make
# an argument for that.
#
# But there is a "cleaner" way to do this. Check out the apply function
col.mat = apply(mat.1K,2,function(x) x/sum(x)*100) # The "2" represents columns
# The following should be equal
all.equal(col.mat,my.third.percent.func(mat.1K,2))
system.time(my.first.percent.func(mat.1K)) # Two for loops
system.time(my.second.percent.func(mat.1K)) # One for loop
system.time(apply(mat.1K,2,function(x) x/sum(x)*100)) # no for loops
# Now let's look at the timings for the 10,000 row matrix
system.time(my.first.percent.func(mat.10K)) # Two for loops
system.time(my.second.percent.func(mat.10K)) # One for loop
system.time(apply(mat.10K,2,function(x) x/sum(x)*100)) # no for loops
# What if you wanted to apply your percentage function to the rows ?
# Or what if you wanted to change the function ? No recoding required.
row.df = apply(mat.1K,1,function(x) x/sum(x)*100) # The "1" represents rows
# Note - you will need to take the transpose of row.df to get the matrix to look
# like the original. This is because apply across rows returns a transpose.
apply(mat.1K,2,mean)
apply(mat.1K,2,summary)
apply(mat.1K,1,mean)
by(mtcars$mpg, list(am=mtcars$am, cyl=mtcars$cyl), mean)
by(mtcars[,c('hp','wt')], list(am=mtcars$am), mean)
by(mtcars['mpg'], list(am=mtcars$am, cyl=mtcars$cyl), mean)
tapply(mtcars$mpg,mtcars$am,mean)
tapply(mtcars$mpg,list(mtcars$am,mtcars$cyl),mean)
###
# Let's talk about aggregation now that we know something about the apply functions
###
# Simple tables
letters[1:10] # Built in letters char vector
my.sample = sample(letters[1:10],20,replace=TRUE) # sample 20 times from the 1st 10 letters with replacement
table(my.sample) # Counts frequency of letters
# The table is much simpler than writing a for loop
# We will now use the internal mtcars data set - assumes some factors are present
table(new.snps$SNP1,new.snps$co)
my.mtcars = transform(mtcars,am=factor(am,labels=c("auto","manual")))
my.table = table(my.mtcars$cyl,my.mtcars$am)
# We can even add in another group - Let's try number of gears
my.3d.table = table(my.mtcars$cyl,my.mtcars$am,my.mtcars$gear)
my.3d.table
# Here we create some categories on the fly using cut
with(my.mtcars,table(cut(mpg,quantile(mpg)),cyl))
with(my.mtcars,table(cut(mpg,quantile(mpg),labels=c("Big Gas Hog","Gas Hog","Not Bad","The Best")),cyl))
# The xtabs function also summarizes things. It uses a formula notation
xtabs(~SNP1 + SNP2,new.snps)
xtabs(~am + cyl,my.mtcars)
xtabs(~am + cyl + gear,my.mtcars)
class(my.table)
my.table
# Tables are easily visualized
barplot(my.table,legend=T,col=c("red","blue","green"),ylim=c(0,20))
plot(my.table,col=c("red","blue","green"),main="Mosaic Plot")
# You can see the margin totals
addmargins(my.table)
addmargins(my.table,1)
addmargins(my.table,2)
# Working with table proportions
addmargins(prop.table(my.table))
addmargins(prop.table(my.table,2)) # Look at proportions columnwise
addmargins(prop.table(my.table,1)) # Look at proportions row wise
##
# Let's explore the aggregate function
##
# The aggregate function in this case is "mean" but could be something else
aggregate(mtcars['mpg'],list(Transmission=mtcars$am),mean)
aggregate(mtcars[c('mpg','hp')],list(Transmission_Type=mtcars$am),mean)
aggregate(mtcars[c('mpg','hp')],list(Transmission_Type=mtcars$am,Cylinders=mtcars$cyl),mean)
#
aggregate(mpg~am,my.mtcars,mean)
# Do the mean aggregation of MPG by Transmission and cylinder type
aggregate(mpg~am+cyl,my.mtcars,mean)
# Do the mean aggregation of MPG & weight by transmission and cylinder type
aggregate(cbind(mpg,wt,hp)~am+cyl,my.mtcars,mean)
# Look at the SNP data
aggregate(cbind(X1,X2)~co+SNP1+SNP2,new.snps,mean)
# Aggregate has an alternative calling sequence
aggregate(my.mtcars[c('mpg','wt','hp')],by=list(am=my.mtcars$am,cyl=my.mtcars$cyl),mean)
# The Reshape package
library(reshape)
my.melt = melt(my.mtcars,id.vars=c("cyl","am"))
my.melt
cast(my.melt, cyl ~ variable, mean)
cast(my.melt, cyl ~ variable| am, mean,subset=variable %in% c("hp","wt"))
# Let's work with some SNP data
cast(melt(new.snps,id.vars=c("SNP1","SNP2","co")),SNP1 +SNP2 + co ~ variable,mean)
aggregate(new.snps[c('X1','X2')],by=list(SNP1=new.snps$SNP1,SNP2=new.snps$SNP2,CO=new.snps$co),mean)
# But we can provide our own function as an argument. In this case the mean and standard deviation
f = function(x) {mx=mean(x);sdx=sd(x);mz=c(mean=mx,sd=sdx)}
aggregate(new.snps[c('X1','X2')],by=list(SNP1=my.snps$SNP1,SNP2=my.snps$SNP2),f)
# What would the reshape solution look like ?
cast(melt(new.snps,id.vars=c("SNP1","SNP2","co")),SNP1 +SNP2 ~ variable,f)
# Using the R SQLDF Library
options(gsubfn.engine = "R")
library(sqldf)
DF = my.snps
sqldf("select SNP1,SNP2,avg(X1),avg(X2),stdev(X1),stdev(X2) from DF group by SNP1,SNP2")
# Let's labels things better using SQL
sqldf("select SNP1,SNP2,avg(X1) as 'Mean X1',avg(X2) as 'Mean X2',stdev(X1) as 'Stdev X1', stdev(X2) as 'Stdev X2' from DF group by SNP1,SNP2",drv="SQLite")
DF = my.mtcars
tempdf = sqldf("select * from DF where mpg < 20 and am = 'manual'")
sqldf("select avg(mpg),avg(hp) from (select * from DF where am == 'auto')")
sqldf("select avg(mpg),avg(hp) from (select * from DF where am == 'manual')")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment