jwbargsten · September 3, 2012 21:09
diff --git a/corr_light_intensity.R b/corr_light_intensity.R
 # Copyright (c) 2012 Joachim Bargsten
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
 # documentation files (the "Software"), to deal in the Software without restriction, including without
 # limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 # the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
 # conditions:

 # The above copyright notice and this permission notice shall be included in all copies or substantial
 # portions of the Software.

 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
 # LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
 # EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 # AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
 # OR OTHER DEALINGS IN THE SOFTWARE.

 library(colorRamps)
 library(grid)
 library(reshape2)
 library(reshape)
 library(scales)
 library(ggplot2)
 library(plyr)
 library(gdata)
 library(Hmisc)

 ## Label-formatter factory: returns a function that rounds its input to two
 ## decimals and formats the result with comma(); any arguments given to
 ## comma_format2() are forwarded to comma().
 comma_format2 <- function(...) {
     function(x) {
         rounded <- round(x, 2)
         comma(rounded, ...)
     }
 }

 ## output files: heatmap, the two boxplot variants and the result table
 correlation_heatmap_file        <- "correlation_matrix.fdr.png"
 correlation_boxplot_file        <- "light-intensity_vs_all.png"
 correlation_boxplot_jitter_file <- "light-intensity_vs_all-jitter.png"
 correlation_data_file           <- "correlation.results.csv"

 ## input measurements
 data.raw <- read.csv(file = "correlation_hanzi_2012-08-20.csv")


 ## drop the "repetition" column
 ## (drop = FALSE keeps a data frame even if a single column remains)
 data <- data.raw[, names(data.raw) != "repetition", drop = FALSE]

 ## omit na introduced by excel (rows without a genotype)
 data <- data[!is.na(data[, "genotypes"]), , drop = FALSE]

 ## show NAs per column
 ## (explicit print(): top-level values are not auto-printed under source())
 print(colwise(function(x) { sum(is.na(x)) })(data))

 ## replace NA by mean per genotype (numeric columns only)
 data.nona <- ddply(
     data,
     .(genotypes),
     colwise(function(x) {
         if (is.numeric(x)) {
             x[is.na(x)] <- mean(x, na.rm = TRUE)
         }
         x
     })
 )

 ## check again for NAs
 ## (anything still reported here could not be imputed, e.g. a column that
 ## is missing for every row of a genotype)
 print(colwise(function(x) { sum(is.na(x)) })(data.nona))

 ## normalise data: centre and scale every numeric column,
 ## leave all other columns untouched
 data.scaled <- colwise(function(x) {
     if (is.numeric(x)) scale(x) else x
 })(data.nona)

 ## move genotypes and light_intensity to the front of the column order
 ## (the per-genotype step below reads the light intensity values from the
 ## first row/column of the correlation and p-value matrices)
 first_rows <- c("genotypes", "light_intensity")
 data.scaled <- cbind(
     data.scaled[, names(data.scaled) %in% first_rows],
     data.scaled[, !(names(data.scaled) %in% first_rows)]
 )

 ## flag the numeric columns; only those enter the correlation
 data.numcols <- vapply(data.scaled, is.numeric, logical(1))

 ## correlate light intensity with every other numeric column,
 ## separately for each genotype
 ds.cor <- ddply(
     data.scaled,
     .(genotypes),
     function(ds, numcols) {
         ## Pearson correlations and p-values between all numeric columns
         pairwise <- rcorr(as.matrix(ds[, numcols]), type = "pearson")

         ## FDR-adjust all non-missing p-values of the matrix
         has_p <- !is.na(pairwise$P)
         pairwise$P[has_p] <- p.adjust(pairwise$P[has_p], method = "fdr")

         ## first row/column is the light intensity; drop its self-entry
         r_vs_light <- pairwise$r[-1, 1]
         p_vs_light <- pairwise$P[1, -1]
         n_tests <- length(r_vs_light)

         ## one row per tested column, labelled with this subset's genotype
         data.frame(
             test = names(r_vs_light),
             correlation = r_vs_light,
             p.value = p_vs_light,
             genotypes = rep(ds[1, "genotypes"], n_tests),
             covariates = rep("light_intensity", n_tests)
         )
     },
     data.numcols
 )

 ## create the heatmap plot:
 ## coloured tiles for significant correlations (adjusted p < 0.05),
 ## grey tiles for everything else (missing correlation or p >= 0.05, so a
 ## p-value of exactly 0.05 still gets a tile)
 p <- ggplot(ds.cor, aes(test, genotypes)) +
     geom_tile(
         data = subset(ds.cor, !is.na(correlation) & p.value < 0.05),
         aes(fill = correlation),
         colour = "white"
     ) +
     scale_fill_gradient2(
         low = "steelblue", high = "darkred",
         breaks = seq(-0.7, 0.7, 0.1), labels = comma_format2()
     ) +
     geom_tile(
         data = subset(ds.cor, is.na(correlation) | p.value >= 0.05),
         fill = "#dfdfdf", colour = "white", linetype = 1
     )

 ## theme()/element_*() replace the defunct opts()/theme_*() API
 p <- p + theme(
     axis.ticks  = element_blank(),
     axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1),
     axis.text.y = element_text(face = "bold", size = 8, hjust = 1)
 )

 ## no padding around the tiles
 p <- p + scale_y_discrete(expand = c(0, 0)) + scale_x_discrete(expand = c(0, 0))

 ## plot the heatmap
 ## (explicit print(): a ggplot object is only drawn when printed, which
 ## does not happen automatically when the script is run via source())
 png(correlation_heatmap_file, res = 300, width = 19, height = 17, units = "cm")
 print(p + facet_grid(covariates ~ .))
 dev.off()

 ## create a boxplot of all variables vs light intensity
 ## (uses the imputed but unscaled data)
 data.box <- data.nona
 ## factor() on the raw values keeps numeric light intensities in numeric
 ## order; converting to character first would sort the levels as text
 ## (e.g. "100" before "20")
 data.box[, "light_intensity"] <- factor(data.box[, "light_intensity"])
 data.box.long <- melt(data.box)
 p <- ggplot(data.box.long, aes(light_intensity, value)) +
     geom_boxplot() +
     facet_wrap(~ variable, scales = "free")

 ## the device is opened only after the plot object exists; explicit
 ## print() because ggplot objects are not drawn automatically under source()
 png(correlation_boxplot_file, res = 300, width = 30, height = 20, units = "cm")
 print(p)
 dev.off()

 ## same as above, only with data points included
 png(correlation_boxplot_jitter_file, res = 300, width = 30, height = 20, units = "cm")
 print(p + geom_jitter(aes(light_intensity, value, colour = genotypes)))
 dev.off()

 ## save the per-genotype correlation table
 write.csv(x = ds.cor, file = correlation_data_file)
	# Copyright (c) 2012 Joachim Bargsten
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
	# documentation files (the "Software"), to deal in the Software without restriction, including without
	# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
	# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
	# conditions:

	# The above copyright notice and this permission notice shall be included in all copies or substantial
	# portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
	# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO
	# EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
	# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
	# OR OTHER DEALINGS IN THE SOFTWARE.

	library(colorRamps)
	library(grid)
	library(reshape2)
	library(reshape)
	library(scales)
	library(ggplot2)
	library(plyr)
	library(gdata)
	library(Hmisc)

	## Label-formatter factory: returns a function that rounds its input to two
	## decimals and formats the result with comma(); any arguments given to
	## comma_format2() are forwarded to comma().
	comma_format2 <- function(...) {
	    function(x) {
	        rounded <- round(x, 2)
	        comma(rounded, ...)
	    }
	}

	## output files: heatmap, the two boxplot variants and the result table
	correlation_heatmap_file        <- "correlation_matrix.fdr.png"
	correlation_boxplot_file        <- "light-intensity_vs_all.png"
	correlation_boxplot_jitter_file <- "light-intensity_vs_all-jitter.png"
	correlation_data_file           <- "correlation.results.csv"

	## input measurements
	data.raw <- read.csv(file = "correlation_hanzi_2012-08-20.csv")


	## drop the "repetition" column
	## (drop = FALSE keeps a data frame even if a single column remains)
	data <- data.raw[, names(data.raw) != "repetition", drop = FALSE]

	## omit na introduced by excel (rows without a genotype)
	data <- data[!is.na(data[, "genotypes"]), , drop = FALSE]

	## show NAs per column
	## (explicit print(): top-level values are not auto-printed under source())
	print(colwise(function(x) { sum(is.na(x)) })(data))

	## replace NA by mean per genotype (numeric columns only)
	data.nona <- ddply(
	    data,
	    .(genotypes),
	    colwise(function(x) {
	        if (is.numeric(x)) {
	            x[is.na(x)] <- mean(x, na.rm = TRUE)
	        }
	        x
	    })
	)

	## check again for NAs
	## (anything still reported here could not be imputed, e.g. a column that
	## is missing for every row of a genotype)
	print(colwise(function(x) { sum(is.na(x)) })(data.nona))

	## normalise data: centre and scale every numeric column,
	## leave all other columns untouched
	data.scaled <- colwise(function(x) {
	    if (is.numeric(x)) scale(x) else x
	})(data.nona)

	## move genotypes and light_intensity to the front of the column order
	## (the per-genotype step below reads the light intensity values from the
	## first row/column of the correlation and p-value matrices)
	first_rows <- c("genotypes", "light_intensity")
	data.scaled <- cbind(
	    data.scaled[, names(data.scaled) %in% first_rows],
	    data.scaled[, !(names(data.scaled) %in% first_rows)]
	)

	## flag the numeric columns; only those enter the correlation
	data.numcols <- vapply(data.scaled, is.numeric, logical(1))

	## correlate light intensity with every other numeric column,
	## separately for each genotype
	ds.cor <- ddply(
	    data.scaled,
	    .(genotypes),
	    function(ds, numcols) {
	        ## Pearson correlations and p-values between all numeric columns
	        pairwise <- rcorr(as.matrix(ds[, numcols]), type = "pearson")

	        ## FDR-adjust all non-missing p-values of the matrix
	        has_p <- !is.na(pairwise$P)
	        pairwise$P[has_p] <- p.adjust(pairwise$P[has_p], method = "fdr")

	        ## first row/column is the light intensity; drop its self-entry
	        r_vs_light <- pairwise$r[-1, 1]
	        p_vs_light <- pairwise$P[1, -1]
	        n_tests <- length(r_vs_light)

	        ## one row per tested column, labelled with this subset's genotype
	        data.frame(
	            test = names(r_vs_light),
	            correlation = r_vs_light,
	            p.value = p_vs_light,
	            genotypes = rep(ds[1, "genotypes"], n_tests),
	            covariates = rep("light_intensity", n_tests)
	        )
	    },
	    data.numcols
	)

	## create the heatmap plot:
	## coloured tiles for significant correlations (adjusted p < 0.05),
	## grey tiles for everything else (missing correlation or p >= 0.05, so a
	## p-value of exactly 0.05 still gets a tile)
	p <- ggplot(ds.cor, aes(test, genotypes)) +
	    geom_tile(
	        data = subset(ds.cor, !is.na(correlation) & p.value < 0.05),
	        aes(fill = correlation),
	        colour = "white"
	    ) +
	    scale_fill_gradient2(
	        low = "steelblue", high = "darkred",
	        breaks = seq(-0.7, 0.7, 0.1), labels = comma_format2()
	    ) +
	    geom_tile(
	        data = subset(ds.cor, is.na(correlation) | p.value >= 0.05),
	        fill = "#dfdfdf", colour = "white", linetype = 1
	    )

	## theme()/element_*() replace the defunct opts()/theme_*() API
	p <- p + theme(
	    axis.ticks  = element_blank(),
	    axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1),
	    axis.text.y = element_text(face = "bold", size = 8, hjust = 1)
	)

	## no padding around the tiles
	p <- p + scale_y_discrete(expand = c(0, 0)) + scale_x_discrete(expand = c(0, 0))

	## plot the heatmap
	## (explicit print(): a ggplot object is only drawn when printed, which
	## does not happen automatically when the script is run via source())
	png(correlation_heatmap_file, res = 300, width = 19, height = 17, units = "cm")
	print(p + facet_grid(covariates ~ .))
	dev.off()

	## create a boxplot of all variables vs light intensity
	## (uses the imputed but unscaled data)
	data.box <- data.nona
	## factor() on the raw values keeps numeric light intensities in numeric
	## order; converting to character first would sort the levels as text
	## (e.g. "100" before "20")
	data.box[, "light_intensity"] <- factor(data.box[, "light_intensity"])
	data.box.long <- melt(data.box)
	p <- ggplot(data.box.long, aes(light_intensity, value)) +
	    geom_boxplot() +
	    facet_wrap(~ variable, scales = "free")

	## the device is opened only after the plot object exists; explicit
	## print() because ggplot objects are not drawn automatically under source()
	png(correlation_boxplot_file, res = 300, width = 30, height = 20, units = "cm")
	print(p)
	dev.off()

	## same as above, only with data points included
	png(correlation_boxplot_jitter_file, res = 300, width = 30, height = 20, units = "cm")
	print(p + geom_jitter(aes(light_intensity, value, colour = genotypes)))
	dev.off()

	## save the per-genotype correlation table
	write.csv(x = ds.cor, file = correlation_data_file)