mg14 · May 19, 2018 06:08 · mg14 · Aug 21, 2015
diff --git a/ncomms6901-s3.R b/ncomms6901-s3.R
 #' Supplementary Data 2
 #' =======================
 #' ### Supplementary code and figures accompanying *Interconnections among mutations, gene expression, clinical variables and patient outcome in myelodysplastic syndromes*
 #' 
 #' This document contains the complete code used in the analysis. It is purely written in `R` using a series of `R` and `Bioconductor` packages. 
 #' This report has been generated using the `knitr` R package (http://yihui.name/knitr/). 
 #' For a complete list of packages and their versions please have a look at the end of this document.

 #+ options, echo=FALSE, eval=TRUE
 options(width=120)
 pdf.options(pointsize=8)
 opts_chunk$set(dev=c('my_png','pdf'), fig.ext=c('png','pdf'), fig.width=4, fig.height=4, pointsize=8)
 my_png <-  function(file, width, height, pointsize=12, ...) {
 	png(file, width = 1.5*width, height = 1.5*height, units="in", res=72*1.5, pointsize=pointsize, ...)
 }

 #' ### Libraries
 #' Load necessary libraries
 #+ libraries
 library(limma)
 library(org.Hs.eg.db)
 library(RColorBrewer)
 library(AnnotationDbi)
 library(affy)
 library(gcrma)
 library(hgu133plus2.db )
 library(VennDiagram)
 library(org.Hs.eg.db)
 library(GenomicRanges)
 library(GenomicFeatures)
 library(rtracklayer)
 library(biomaRt)
 library(glmnet)
 library(survival)
 library(Hmisc)
 library(randomForestSRC)
 set1 = c(brewer.pal(9,"Set1"), brewer.pal(8, "Dark2"))
 source("suppData/mg14.R")


 #' ## 1. Data preprocessing
 #' ### 1. Normalisation of expression data
 #' This uses the `affy` package and `gcrma` correction. Pretty standard 
 #' We assume you have downloaded the complete tarball GSE58831_RAW.tar from GEO http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE58831 
 #' and untared the archive and gunzipped all .CEL files into directory `GSE58831`. 
 #+ preprocess, cache=TRUE
 celFiles <- dir("GSE58831", pattern = ".CEL", full.names = T)
 celFiles
 affyBatch <- read.affybatch(filenames = celFiles)
 gset = gcrma(affyBatch)
 samples = sub("_.+","", sampleNames(gset))
 sampleNames(gset) = samples

 #' Now merge probes to genes by the means of all probes mapping to a particular entrez id 
 #+ merge, cache=TRUE
 tab <- select(hgu133plus2.db, keys = keys(hgu133plus2.db), columns = c("ENTREZID"))
 e <- exprs(gset)
 geneExpr <- t(sapply(split(tab[,1], tab[,2]), function(ids){
 					colMeans(e[ids,,drop=FALSE])
 				}))
 rm(tab,e)

 #' ### 2. Load mutation and clinical data
 #' Load clinical data for 159 MDS patients and 17 normals from Supplementary Table S1.
 mdsData <- read.table("suppData/SuppTableS1GEO.txt", sep="\t", header=TRUE, check.names=FALSE) ## A tab-delimited version of Supplementary Table S1
 head(mdsData)
 ix <- setdiff(na.omit(match(samples, mdsData$GEOID)), which(is.na(mdsData$PDID))) ## All MDS samples with expression and seq data
 normalSamples <- as.character(mdsData$GEOID[mdsData$Type=="Normal"])

 #' Define mappings
 GEO2PD <- as.character(mdsData$PDID) 
 names(GEO2PD) <- mdsData$GEOID
 PD2GEO <- as.character(mdsData$GEOID)
 names(PD2GEO) <- mdsData$PDID

 #' #### Clinical summary statistics
 f <- function(x) cat(paste(median(x, na.rm=TRUE), " median; ", min(x, na.rm=TRUE),"-",max(x, na.rm=TRUE), " range; ", sum(is.na(x)), " missing", sep=""),"\n")
 mdsIx <- !is.na(mdsData$GEOID[mdsData$Type=="MDS"]) ## All MDS samples with expression data
 table(mdsData$Gender[mdsIx])
 f(mdsData$Age[mdsIx])
 table(mdsData$WHO_category[mdsIx])
 f(as.numeric(as.character(mdsData$BM_blasts_pct[mdsIx])))
 f(as.numeric(as.character(mdsData$Ring_sideroblasts_pct[mdsIx])))
 f(mdsData$Haemoglobin[mdsIx])
 f(mdsData$Absoulte_neutrophile_count[mdsIx])
 f(mdsData$Platelet_count[mdsIx])
 table(mdsData$PB_cytopenia[mdsIx])


 #' ### 3. Match expression and clinical data 
 #' Incrementally construct the design matrix
 design = cbind(offset=1,mdsData[ix, grep("SF3B1|TET2|SRSF2|ASXL1|DNMT3A|RUNX1|U2AF1|TP53|EZH2|IDH2|STAG2|ZRSR2|CBL|BCOR|NRAS|JAK2|CUX1|IDH1|KRAS|PHF6|EP300|GATA2|NPM1|MLL2|PTPN11|CREBBP|KIT|MPL|NF1|WT1|IRF1|RAD21|ATRX|CDKN2A|ETV6|KDM6A|CEBPA|FLT3|GNAS|PTEN|SH2B3|BRAF|CTNNA1", colnames(mdsData))]) # oncogenic mutations
 minF=5 ## Minimal number of alterations
 design = design[,colSums(design)>=minF]
 rownames(design) <- mdsData$GEOID[ix]

 #' Cytogenetics
 cytoImputed <- mdsData[ix, grep("rearr|del|tri|abn|complex|other", colnames(mdsData), value=TRUE)[-12:-11]]
 cytoImputed <- cytoImputed[,colSums(cytoImputed, na.rm=TRUE)>0]
 design <- cbind(design,  cytoImputed[,colSums(cytoImputed, na.rm=TRUE)>=minF], Gender=mdsData[ix,"Gender"], Age=scale(mdsData[ix,"age_imp"], center=TRUE, scale=FALSE))

 #' Include 17 normal samples
 n <- nrow(design)
 design <- rbind(design, matrix(0, nrow=17, ncol=ncol(design), dimnames = list(NULL,colnames(design))))
 design <- cbind(design, Normal=c(rep(0,n), rep(1,17)))
 design[,1] <- 1
 design[n+1:17,"Age"] <- NA #mean(design[1:n,"Age"])
 design[n+1:17,"Gender"] <- geneExpr["7503",normalSamples] > 5 ## XIST expression
 rownames(design)[n+1:17] <- normalSamples
 design <- design[,c(1:17,20,18:19)]

 #' Impute missing by mean
 design0 <- design
 for(j in 1:ncol(design))
 	design[is.na(design[,j]),j] <- mean(design[,j], na.rm=TRUE)

 head(design)

 #' Define colors
 colMutations = c(brewer.pal(8,"Set1")[-6], rev(brewer.pal(8,"Dark2")), brewer.pal(7,"Set2"))[c(1:12,16:19,13:15)]
 o <- order(apply(col2rgb(colMutations),2,rgb2hsv)[1,])
 colMutations <- colMutations[rev(o)][(4*1:19 +15) %% 19 + 1]
 names(colMutations) <- colnames(design)[-1]

 #' Show a venn diagram with the overlap
 #+ MDS-GE-venn, fig.width=1.8, fig.height=1.8, dev.args=list(font="Helvetica")
 par(bty="n", mgp = c(2,.33,0), mar=c(3,3,1,0)+.1, las=1, tcl=-.25)
 grid.newpage()
 pushViewport(viewport(w = .9, h = .9))
 grid.draw(venn.diagram(list(Sequenced=mdsData$GEOID[!is.na(mdsData$PDID)], Expression = samples, Normal=normalSamples), filename=NULL, lty=1, 
 				col=colMutations[1:3], fill=colMutations[1:3], alpha=0.05, euler.d=TRUE, fontfamily="Helvetica", cat.fontfamily="Helvetica", cat.fontface="italic", euler.diagram=TRUE))


 #' ## 2. Model fitting
 #' We use the lmFit function from the `limma` package. This comes with a whole series of powerful and reliable tests.
 glm = lmFit(geneExpr[,rownames(design)], design = design ) 
 glm = eBayes(glm)

 #' ### 1. F-statistic for all but offset
 #' Here we want to determine all genes which are associated with any covariate. This will be based on an F-statistic.
 #' lmFit also tests wether the offset is different from zero (trivially true).
 F.stat <- classifyTestsF(glm[,-1],fstat.only=TRUE) # remove offset
 glm$F <- as.vector(F.stat)
 df1 <- attr(F.stat,"df1")
 df2 <- attr(F.stat,"df2")
 if(df2[1] > 1e6){ # Work around bug in R 2.1
 	glm$F.p.value <- pchisq(df1*glm$F,df1,lower.tail=FALSE)
 }else
 	glm$F.p.value <- pf(glm$F,df1,df2,lower.tail=FALSE)

 #' #### Random model
 #' Compare to a model where all values of the covariates are randomly permuted. If all model assumptions were correct, this wouldn't be needed.
 set.seed(42)
 rlm <- lmFit(geneExpr[,rownames(design)], apply(design, 2, sample))
 rlm <- eBayes(rlm)
 F.stat <- classifyTestsF(rlm[,-1],fstat.only=TRUE)
 rlm$F <- as.vector(F.stat)
 df1 <- attr(F.stat,"df1")
 df2 <- attr(F.stat,"df2")
 if(df2[1] > 1e6){ # Work around bug in R 2.1
 	rlm$F.p.value <- pchisq(df1*rlm$F,df1,lower.tail=FALSE)
 }else
 	rlm$F.p.value <- pf(rlm$F,df1,df2,lower.tail=FALSE)

 #' #### Explained variance by different categories
 #' The F-statistic is directly related to the R2.
 F.stat <- classifyTestsF(glm[,2:16],fstat.only=TRUE) ## All genetics & cytogenetics
 df1 <- attr(F.stat,"df1")
 df2 <- attr(F.stat,"df2")
 F.p.value <- pchisq(df1*F.stat,df1,lower.tail=FALSE)

 R.stat <- classifyTestsF(rlm[,2:16],fstat.only=TRUE) ## Random

 Rall = 1 - 1/(1 + glm$F * (ncol(design)-1)/(nrow(design)-ncol(design)))
 Rgenetics = 1 - 1/(1 + F.stat * 15/(nrow(design)-ncol(design)))
 Pgenetics = 1 - 1/(1 + R.stat * 15/(nrow(design)-ncol(design)))
 names(Rgenetics) <- names(Pgenetics) <- names(Rall) <-  rownames(geneExpr)

 #' Plot the variance explained by genetics
 #+ GE-R2, fig.width=2, fig.height=1.8
 par(bty="n", mgp = c(2,.33,0), mar=c(3,2.5,1,1)+.1, las=1, tcl=-.25, xpd=NA)
 d <- density(Pgenetics,bw=1e-3)
 f <- 1#nrow(gexpr)/512
 plot(d$x, d$y * f, col='grey', xlab=expression(paste("Explained variance per gene ", R^2)), main="", lwd=2, type="l", ylab="", xlim=c(0,0.7))
 title(ylab="Density", line=1.5)
 d <- density(Rgenetics, bw=1e-3)
 r <- min(Rgenetics[p.adjust(F.p.value,"BH")<0.05])
 x0 <- which(d$x>r)
 polygon(d$x[c(x0[1],x0)], c(0,d$y[x0])* f, col=paste(set1[1],"44",sep=""), border=NA)
 lines(d$x, d$y* f, col=set1[1], lwd=2)
 #points(d$x[x0[1]], d$y[x0[1]]*f, col=set1[1], pch=16)
 text(d$x[x0[1]], d$y[x0[1]]*f, pos=4, paste(sum(Rgenetics > r), "genes q < 0.05"))
 arrows(Rgenetics["22"], par("usr")[4]/7, Rgenetics["22"], par("usr")[4]/50, length=0.05)
 text(Rgenetics["22"], par("usr")[4]/8, "ABCB7", font=3, pos=3)
 legend("topright", bty="n", col=c(set1[1], "grey"), lty=1, c("Observed","Random"), lwd=2)

 #' #### Predictions
 glmPrediction <- glm$coefficients %*% t(design)
 rlmPrediction <- rlm$coefficients %*% t(design)

 #' #### ABCB7 expression
 #+ GE-ABCB7,  fig.width=2, fig.height=1.8
 par(bty="n", mgp = c(1.5,.33,0), mar=c(2.5,2.5,1,1)+.1, las=1, tcl=-.25)
 plot(glmPrediction["22",], geneExpr["22",rownames(design)], ylab=expression(paste("Observed ",italic("ABCB7"), " expression")), xlab=expression(paste("Predicted ",italic("ABCB7"), " expression")), pch=16, cex=.8)
 abline(0,1)
 u <- par("usr")
 par(xpd=NA)
 y <- glm$coefficients["22",-1]+glm$coefficients["22",1]
 u <- par("usr")
 x0 <- rep(u[3]+1,ncol(design)-1)
 y0 <- u[4] + 0.05*(u[4]-u[3]) - rank(-y)/length(y) * (u[4]-u[3])/1.2
 d <- density(y)
 lines(d$x, d$y/5+1+u[3], col="grey")
 lines(d$x, -d$y/5+1+u[3], col="grey")
 points(x=y, y=x0+violinJitter(y, magnitude=0.25)$y, col=colMutations, pch=16)
 text(x=glm$coefficients["22",1], y= 1.5 +u[3], "Model coefficients", cex=0.8)
 w <- glm$p.value["22",-1] < 0.01
 rotatedLabel(y[w], x0[w]+0.1, labels=colnames(design)[-1][w], font=ifelse(grepl("[[:lower:]]", colnames(design)[-1]),1,3)[w], cex=.66, pos=1, col=colMutations[w])
 axis(at=-1:1 + glm$coefficients["22",1], labels=-1:1, side=1, cex.axis=.8, line=-1, mgp = c(1.5,.05,0), tcl=-.15)
 #mtext(at=l$coefficients[1], line=-2, side=1, "Coefficients", cex=.8)
 text(u[1],u[4], substitute(paste(R^2==r),list(r=round(Rgenetics["22"],2))), pos=4)
 #text(u[1],u[4]-(u[4]-u[3])*0.1, substitute(paste(R["Genetics"]^2==r),list(r=round(Rgenetics["22"],2))), pos=4)



 #' ### 2. Significant effects per covariate
 #' Prepare the test results using a hierarchical procedure: 
 #' 1. Adjust down transcripts using F-stat
 #' 2. Adjust along covariates
 #+ testResults
 testResults <- decideTests(glm, method="hierarchical",adjust.method="BH", p.value=0.05)[,-1]
 significantGenes <- sapply(1:ncol(testResults), function(j){
 			c <- glm$coefficients[testResults[,j]!=0,j+1]
 			table(cut(c, breaks=c(-5,seq(-1.5,1.5,l=7),5)))
 		})
 colnames(significantGenes) <- colnames(testResults)

 #' #### Barplot
 #+ GE-quantiles,  fig.width=3, fig.height=2
 par(bty="n", mgp = c(2.5,.33,0), mar=c(3,3.3,2,0)+.1, las=2, tcl=-.25)
 b <- barplot(significantGenes, las=2, ylab = "Differentially expressed genes", col=brewer.pal(8,"RdYlBu"), legend.text=FALSE , border=0, xaxt="n")#, col = set1[simple.annot[names(n)]], border=NA)
 rotatedLabel(x0=b, y0=rep(10, ncol(significantGenes)), labels=colnames(significantGenes), cex=.7, srt=45, font=ifelse(grepl("[[:lower:]]", colnames(design))[-1], 1,3), col=colMutations)
 clip(0,30,0,1000)
 #text(b+0.2, colSums(n)+50, colSums(n), pos=3, cex=.7, srt=90)
 x0 <- 21.5
 image(x=x0+c(0,0.8), y=par("usr")[4]+seq(-100,100,l=9), z=matrix(1:8, ncol=8), col=brewer.pal(8,"RdYlBu"), add=TRUE)
 text(x=x0+1.5, y=par("usr")[4]+seq(-50,50,l=3), format(seq(-1,1,l=3),2), cex=0.66)
 lines(x=rep(x0+.8,2), y=par("usr")[4]+c(-75,75))
 segments(x0+.8,par("usr")[4]+seq(-75,75,l=7),x0+.9,par("usr")[4]+seq(-75,75,l=7))
 text(x0+.8, par("usr")[4]+125, "log2 FC", cex=.66)
 rotatedLabel(b-0.1, colSums(significantGenes), colSums(significantGenes), pos=3, cex=, srt=45)


 #' Associated mutations per transcript:
 #+ GE-transcripts,  fig.width=2, fig.height=2
 par(bty="n", mgp = c(2.5,.33,0), mar=c(3,3.3,3,0)+.1, las=1, tcl=-.25)
 t <- table(rowSums(abs(testResults[,1:16])))
 b <- barplot(t[-1],ylab="Differentially expressed genes", col=rev(brewer.pal(7, "Spectral")[-(4:5)]), border=NA)
 rotatedLabel(b-0.1, t[-1], t[-1], pos=3, cex=1, srt=45)
 title(xlab="Associated drivers", line=2)


 #' #### Venn diagrams for significant effects
 #+ GE-Venn, fig.width=1.8, fig.height=1.8, dev.args=list(font="Helvetica")
 v <- apply(testResults!=0,2, which)
 par(bty="n", mgp = c(2,.33,0), mar=c(3,3,1,0)+.1, las=1, tcl=-.25)
 grid.newpage()
 pushViewport(viewport(w = .9, h = .9))
 w <- c("SF3B1","SRSF2","U2AF1","ZRSR2")
 grid.draw(venn.diagram(v[w], filename=NULL, lty=1, col=colMutations[w][c(1,3,4,2)], fill=colMutations[w], alpha=0.05, euler.d=TRUE, fontfamily="Helvetica", cat.fontfamily="Helvetica", cat.fontface="italic"))
 grid.newpage()
 pushViewport(viewport(w = .9, h = .9))
 w <- names(sort(colSums(testResults!=0), decreasing = TRUE))[1:4]
 grid.draw(venn.diagram(v[w], filename=NULL, lty=1, col=colMutations[w], fill=colMutations[w], alpha=0.05, euler.d=TRUE, fontfamily="Helvetica", cat.fontfamily="Helvetica", cat.fontface="italic"))


 #' #### Chromosomal distribution
 chr = factor(sapply(AnnotationDbi::mget(rownames(geneExpr), org.Hs.egCHR, ifnotfound=NA), `[`,1), levels=c(1:22, "X","Y","MT"))
 chromTable <- apply(testResults,2, function(x) table(chr[x!=0]))

 #+ GE-CHR,  fig.width=5, fig.height=4
 par(bty="n", mgp = c(0.5,0.5,0), las=1, tcl=-.25, font.main=3, mfrow=c(4,5), xpd=NA, mar=c(0,0,1.5,0))
 for(j in 1:ncol(testResults)){
 	n <- sum(testResults[,j]!=0)
 	pie(chromTable[,j], col=colorRampPalette(brewer.pal(11,'Spectral'))(24), border="white",  radius=0.8, init.angle=90, labels=ifelse(chromTable[,j]/sum(chromTable[,j]) > 0.02, paste("",rownames(chromTable), "(" ,chromTable[,j], ")",sep=""),""))
 	title(main = colnames(chromTable)[j], font.main = ifelse(grepl("[[:lower:]]", colnames(chromTable)[j]), 1,3), cex.main=1.33)
 	symbols(0,0,circles=.3, inches=FALSE, col="white", bg="white", lty=0, add=TRUE)
 	#symbols(0,0,circles=.8*(1-sqrt(n/max(colSums(testResults!=0)))), col="white", add=TRUE, lty=0, bg="white", inches=FALSE)
 	#cat(n,"\n")
 }
 t <- table(chr)
 pie(t, col=colorRampPalette(brewer.pal(11,'Spectral'))(24), border="white",  radius=0.8, cex.main=1.33, labels=ifelse(t/sum(t) > 0.02, names(t),""), init.angle=90)
 symbols(0,0,circles=.3, inches=FALSE, col="white", bg="white", lty=0, add=TRUE)
 title(main = "# Genes", font.main = ifelse(grepl("[[:lower:]]", colnames(chromTable)[j]), 1,3))


 #' ### 3. Heatmaps for Figure 1b
 #+ Genotypes,  fig.width=1, fig.height=5
 par(bty="n", mgp = c(2,.33,0), mar=rep(0,4), las=1, tcl=-.25, xpd=NA)
 plot(NA,NA, xlim=c(0,ncol(design)-1), ylim=c(0,nrow(design)), xaxt="n", yaxt="n", xlab="",ylab="", xaxs="i", yaxs="i")
 z <- design[,-1]
 h <- hclust(dist(z[,1:16]))
 j <- hclust(dist(t(z)))
 rasterImage(sapply(1:ncol(z), function(i) ifelse(z[,i]>0, colMutations[i-1], "#FFFFFF"))[h$order,j$order], 0, 0, ncol(design)-1, nrow(design), interpolate=FALSE)

 #+ Expression,  fig.width=1, fig.height=10
 par(bty="n", mgp = c(2,.33,0), mar=rep(0,4), las=1, tcl=-.25, xpd=NA)
 w <- names(sort(Rgenetics, decreasing = TRUE)[1:1000])
 z <- geneExpr[w,]-rowMeans(geneExpr[w,])
 h <- hclust(dist(z))
 i <- hclust(dist(t(z)))
 plot(NA,NA, xlim=c(0,ncol(geneExpr)), ylim=c(0,1000), xaxt="n", yaxt="n", xlab="",ylab="", xaxs="i", yaxs="i")
 rasterImage(matrix(brewer.pal(11,"RdBu")[cut(z[h$order,i$order], 12)], ncol=ncol(geneExpr)), 0,0,ncol(geneExpr),1000, interpolate=FALSE)

 #+ Coefficients,  fig.width=1, fig.height=5
 par(bty="n", mgp = c(2,.33,0), mar=rep(0,4), las=1, tcl=-.25, xpd=NA)
 plot(NA,NA, xlim=c(0,ncol(design)-1), ylim=c(0,1000), xaxt="n", yaxt="n", xlab="",ylab="", xaxs="i", yaxs="i")
 rasterImage(matrix(brewer.pal(11,"RdBu")[cut(glm$coefficients[w,-1][h$order,j$order], seq(-3,3,l=12))], ncol=ncol(design)-1), 0,0,ncol(design)-1,1000, interpolate=FALSE)


 #' ### 4. Test for gene and cytogenetic interactions
 #' Here we want to compare how mutational co-occurrence and the sets of transcriptional changes are interlinked
 genomicData = mdsData[,colnames(design)[2:17]]
 interactions <- interactionsGenes <- sapply(1:ncol(genomicData), function(i) sapply(1:ncol(genomicData), function(j) {f<- try(fisher.test(genomicData[,i], genomicData[,j]), silent=TRUE); if(class(f)=="try-error") 0 else ifelse(f$estimate>1, -log10(f$p.val),log10(f$p.val))} ))
 oddsRatio <- oddsGenes <- sapply(1:ncol(genomicData), function(i) sapply(1:ncol(genomicData), function(j) {f<- try(fisher.test(genomicData[,i] + .5, genomicData[,j] +.5), silent=TRUE); if(class(f)=="try-error") f=NA else f$estimate} ))
 w <- p.adjust(glm$F.p.value,"BH")<0.05
 oddsExpression <- sapply(1:ncol(genomicData), function(i) sapply(1:ncol(genomicData), function(j) {f<- try(fisher.test(abs(testResults[w,i]), abs(testResults[w,j])), silent=TRUE); if(class(f)=="try-error") f=NA else f$estimate} ))
 interactionsExpression <- sapply(1:ncol(genomicData), function(i) sapply(1:ncol(genomicData), function(j) {f<- try(fisher.test(abs(testResults[w,i]), abs(testResults[w,j])), silent=TRUE); if(class(f)=="try-error") 0 else ifelse(f$estimate>1, -log10(f$p.val),log10(f$p.val))} ))
 oddsRatio[lower.tri(oddsRatio)] <- oddsExpression[lower.tri(oddsExpression)]
 interactions[lower.tri(interactions)] <- interactionsExpression[lower.tri(interactions)]

 diag(interactions) <- NA
 diag(oddsRatio) <- NA
 colnames(oddsRatio) <- rownames(oddsRatio) <- colnames(interactions) <- rownames(interactions) <- colnames(genomicData)
 oddsRatio[10^-abs(interactions) > 0.05] = 1
 oddsRatio[oddsRatio<1e-3] = 1e-4
 oddsRatio[oddsRatio>1e3] = 1e4
 logOdds=log10(oddsRatio)

 reorder <- function(M, o){
 	u <- M
 	u[lower.tri(u)] <- t(M)[lower.tri(M)]
 	u <- u[o,o]
 	l <- M
 	l[upper.tri(u)] <- t(M)[upper.tri(M)]
 	l <- l[o,o]
 	R <- u
 	R[lower.tri(R)] <- l[lower.tri(R)]
 	return(R)
 }

 #' Plot (*Note: This plot looks a little different as in the paper, where mutation data for all 738 patients is used on the upper panel*).
 #+ Interactions,  fig.width=3.2, fig.height=3.2
 par(bty="n", mgp = c(2,.5,0), mar=rep(4,4)+.1, las=2, tcl=-.33)
 m <- nrow(oddsRatio)
 n <- ncol(oddsRatio)
 o = c(1,11,7,3,4,9,6,10,2,5,8,12:m)#h$order#c(h$order,(length(h$order) +1):ncol(interactions))
 r <- reorder(log10(oddsRatio),o)
 r[lower.tri(r)] <- NA
 image(x=1:n, y=1:m, r, col=brewer.pal(9,"PiYG"), breaks = c(-4:0-.Machine$double.eps,0:4), xaxt="n", yaxt="n", xlab="",ylab="", xlim=c(0, n+4), ylim=c(0, n+4))
 r <- reorder(log10(oddsRatio),o)
 r[upper.tri(r)] <- NA
 image(x=1:n, y=1:m, r, col=brewer.pal(9,"RdBu"), breaks = c(-4:0-.Machine$double.eps,0:4), add=TRUE)
 mtext(side=2, at=1:n, colnames(oddsRatio)[o], font=ifelse(grepl('[[:lower:]]',colnames(oddsRatio)[o]),1,3), col=colMutations[1:16][o])
 rotatedLabel(x0=1:n, y0=rep(0.5, n), colnames(oddsRatio)[o], font=ifelse(grepl('[[:lower:]]',colnames(oddsRatio)[o]),1,3), srt=45, cex=.9, col=colMutations[1:16][o])
 abline(h = length(h$order)+.5, col="white", lwd=1)
 abline(v = length(h$order)+.5, col="white", lwd=1)
 abline(h=0:n+.5, col="white", lwd=.5)
 abline(v=0:n+.5, col="white", lwd=.5)
 text(x=n/2, y=m+.5, "Genetic interactions", pos=3)
 text(x=n+1, y=m/2, "Overlap of expression targets", pos=3, srt=270)
 q <- p.adjust(10^-abs(reorder(interactions,o)), method="BH")
 p <- p.adjust(10^-abs(reorder(interactions,o)), method="holm")
 w = arrayInd(which(q < .1), rep(m,2))
 points(w, pch=".", col="white", cex=1.5)
 w = arrayInd(which(p < .05), rep(m,2))
 points(w, pch="*", col="white")
 image(y = 1:8 +6, x=rep(n,2)+c(2,2.5)+1, z=matrix(c(1:8), nrow=1), col=brewer.pal(8,"PiYG"), add=TRUE)
 image(y = 1:8 +6, x=rep(n,2)+c(2.5,3)+1, z=matrix(c(1:8), nrow=1), col=brewer.pal(8,"RdBu"), add=TRUE)
 axis(side = 4, at = seq(1,7) + 6.5,  tcl=-.15, label=10^seq(-3,3), las=1, lwd=.5)
 mtext(side=4, at=10, "Odds ratio", las=3, line=3)
 par(xpd=NA)
 text(x=n+2.2, y=15, "Correlated", pos=4)
 text(x=n+2.2, y=6-.2, "Exclusive", pos=4)
 points(x=rep(n,2)+3.5, y=1:2, pch=c("*","."))
 image(x=rep(n,2)+c(2,3)+1, y=(3:4) -0.5, z=matrix(1), col=brewer.pal(3,"BrBG"), add=TRUE)
 mtext(side=4, at=1:3, c("Not sig.", "Q < 0.1", "P < 0.05"), line=0.2)



 #' ### 5. Write output
 t= topTable(glm, number=Inf)
 annot <- select(org.Hs.eg.db, rownames(t), c("SYMBOL","GENENAME","CHR","CHRLOC"))
 t <- cbind(annot[!duplicated(annot$ENTREZID),1:5],signif(cbind(t, R2.genetics=Rgenetics[rownames(t)], P.R2.genetics=F.p.value[rownames(t)]), 3), P = signif(glm$p.value[rownames(t),],2),Test=testResults[rownames(t),])
 write.table(t[order(-t$R2.genetics),], file=paste(Sys.Date(),"-SuppTable2.txt",sep=""), sep="\t", row.names = FALSE, quote=FALSE)

 #' ### Contrast against normals
 normalContrast <- rbind(rep(0,18), diag(1,18), rep(-1, 18))[c(1:17,20,18:19),]
 colnames(normalContrast) <- setdiff(colnames(design)[2:20],"Normal")
 normalContrast <- contrasts.fit(glm, normalContrast)
 normalContrast <- eBayes(normalContrast)

 #' ### Expression of mutated genes
 #' The next part will analyse how mutated genes themselves are expressed
 #+ Expression-Drivers,  fig.width=2.5, fig.height=3
 par(bty="n", mgp = c(2.5,0.5,0), las=1, tcl=-.25, font.main=3)
 layout(matrix(1:16, ncol=4, byrow = TRUE), heights=c(1,1,1,0.2))
 w <- grep("[[:lower:]]", colnames(design), invert = TRUE)
 p <- sapply(colnames(design)[w], function(i){
 			geneId <- AnnotationDbi::get(i, revmap(org.Hs.egSYMBOL))
 			
 			c(	p = glm$p.value[geneId,c("Normal",i)],
 			q = normalContrast$p.value[geneId,i])
 		})
 p[] <-  sig2star(p.adjust(p, "BH"), breaks=10^c(-.Machine$double.max.exp,-3:0),labels=c("***","**","*",""))
 for(i in colnames(design)[w]){
 	o <- colSums(p!="")[i]*.5 + (colSums(p!="")[i]>0)*.5
 	par(mar=c(1.2,2.7, o +2,0.5)+.1)
 	x <- design[,i]+1 -design[,"Normal"]
 	geneId <- AnnotationDbi::get(i, revmap(org.Hs.egSYMBOL))
 	v <- do.call("rbind", lapply(split(geneExpr[geneId, rownames(design)], x), violinJitter, magnitude=0.5))
 	plot( sort(x) + v$y, v$x, ylab="", xaxt="n", xlab="", 
 			col=paste(c(colMutations[c('Normal',i)],"#AAAAAA"),"DD", sep="")[c(1,3,2)][sort(x)+1], 
 			xlim=c(-0.5,2.5), cex.main=1, pch = 16, cex=0.5, lwd=0)#genotypes[match(rownames(design), rownames(genotypes)),i]+1)
 	title(ylab="Expression", line=2)
 	title(main=i, line=o +.5)
 	par(xpd=NA)
 	boxplot(at=0:2, geneExpr[geneId, rownames(design)] ~ factor(design[,i] - design[,"Normal"]), add=TRUE, xaxt="n", boxwex=0.5, outline=FALSE, lty=1, outwex=0, staplewex=0, border=c(colMutations[c('Normal',i)],"#777777")[c(1,3,2)])
 	l=0
 	a = list(0:1,1:2, c(0,2))
 	for(j in 3:1){
 		if(p[j,i] != ""){
 		axis(3, at=a[[j]], labels=NA, tcl=0.25, line=l+.5)
 		mtext(side=3, at=a[[j]][1]+.5, p[j,i], line = l)
 		l <- l+.5
 	}
 	}
 	u <- par("usr")
 	rotatedLabel(0:2, rep((u[3]+u[4])/2 - 0.4 * (u[4]-u[3])), paste(table(x),c("Normal", "wt", "mut")))
 	y <- glm$coefficients[AnnotationDbi::get(i, revmap(org.Hs.egSYMBOL)),c("offset",i, "Normal")]
 	lines(0:2, y[1] + c(y[3],0,y[2]))#, pch=21, bg="white",col=c(colMutations[c('normal',i)],"#AAAAAA")[c(1,3,2)], type="b", cex=1.25)
 }


 #' Volcano plot
 #+ Expression-Drivers-Volcano,  fig.width=2.5, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(3,3,1,1)+.1,  tcl=-.25)
 set.seed(42)
 s <- sample(1:nrow(glm),500)
 plot(glm$coefficients[s,-1], (1/glm$p.value[s,-1]), log="y", xlab="Expression logFC", ylab="1/P-value", col="grey", pch=16)
 x <- sapply(colnames(design)[w], function(i) glm$coefficients[AnnotationDbi::get(i, revmap(org.Hs.egSYMBOL)),i])
 y <-  (1/sapply(colnames(design)[w], function(i) glm$p.value[AnnotationDbi::get(i, revmap(org.Hs.egSYMBOL)),i]))
 points(x, y, pch=16)
 text(x,y,ifelse(y>1e2,names(x),""), pos=3, font=3)


 #+ Expression-Drivers-Heatmap,  fig.width=2.5, fig.height=3
 par(bty="n", mgp = c(2.5,.5,0), mar=c(1,4,4,2)+.1,  tcl=-.25)
 z <- glm$coefficients[unlist(AnnotationDbi::mget(colnames(design)[w], revmap(org.Hs.egSYMBOL))),-1]
 n <- nrow(z)
 m <- ncol(z)
 image(x=1:n,y=1:m, z, col=colorRampPalette(brewer.pal(9,"RdBu"))(19), breaks=seq(-2,2,l=20), xaxt="n", yaxt="n", xlab="",ylab="", xlim=range(0:n)+c(0.5,2.5))
 mtext(side=3, at=1:n, colnames(design)[w], las=2, font=3, line=0.2)
 mtext(side=2, at=1:m, colnames(z), las=2, font=ifelse(grepl("[[:upper:]]", colnames(design))[-1], 3,1), line=0.2)
 text(rep(1:n, m), rep(1:m,each=n), sig2star(p.adjust(glm$p.value[unlist(AnnotationDbi::mget(colnames(design)[w], revmap(org.Hs.egSYMBOL))),-1], method="BH")))
 image(x=n+2 + c(-0.5,0.5), y=1:4, z=matrix(1:4, ncol=4), col=brewer.pal(5,"RdBu")[-5], add=TRUE)
 mtext(side=4, at=1:4, text=-2:1, las=2, adj=1, line=1)


 #' ## 2. Principal components analysis
 pca <- prcomp(t(geneExpr))

 #' ### 1. Explained variance by principal components
 #+ PCA-variance,  fig.width=2.5, fig.height=2
 par(bty="n", mgp = c(2.5,.5,0), mar=c(3,4,1,2)+.1, tcl=-.25, las=1)
 plot(pca$sdev^2/sum(pca$sdev^2), type="h", col=set1[1], xlab="", ylab=expression(paste("Explained variance ", Rgenetics^2)) , ylim=c(0,0.15), yaxs="i")
 mtext(side=1, "Principal component", line=2)
 c <- cumsum(pca$sdev^2)/sum(pca$sdev^2)* pca$sdev[1]^2/sum(pca$sdev^2)
 lines(c  , type="s")
 axis(4, at = pretty(c(0,1))* pca$sdev[1]^2/sum(pca$sdev^2), labels=pretty(c(0,1)))
 legend("bottomright", col=c(set1[1],"black"), lty=1, c("Per PC","Cumulative"), bty="n")
 lines(c(180,20,20),c(c[20],c[20],0), lty=3)

 #' ### 2. Spatial clustering
 #' We test wether samples with a given mutations cluster in the PCA world
 x = pca$x[rownames(design),1:20, drop=FALSE]
 d = apply(design[,-1], 2, function(y) mean(sqrt(rowSums((x[y>0,,drop=FALSE] - rep(colMeans(x[y>0,]),each=sum(y>0)))^2)*rep(pca$sdev[1:20]^2/sum(pca$sdev[1:20]^2), each=sum(y>0)))))
 d0 = apply(design[,-1], 2, function(y) sapply(1:1000, function(i) mean(sqrt(rowSums((x[sample(y)>0,,drop=FALSE] - rep(colMeans(x[sample(y)>0,]),each=sum(y>0)))^2)*rep(pca$sdev[1:20]^2/sum(pca$sdev[1:20]^2), each=sum(y>0))))))

 #+ PCA-spatial-clustering,  fig.width=3, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(4,3,1,1)+.1, las=2, tcl=-.25, xpd=NA)
 boxplot(d0, las=2, pch=16, cex=.5, staplewex=0, lty=1, border="grey", ylab="Average distance", xaxt="n", ylim=range(d0))
 rotatedLabel(x0=1:length(d), y0 = par("usr")[3], names(d), font = ifelse(grepl("[[:upper:]]", names(d)), 3,1), cex=0.8, srt=45)
 points(d, col=set1[1], pch=19, cex=1.3)
 legend("topleft", bty="n", legend=c("Observed","Random"), pch=c(19,19), col=c(set1[1],"grey"), pt.cex=c(1,0.5))
 p <-  colMeans(d0 < rep(d, each=1000)) + 1/1000
 rotatedLabel(x0=1:length(d), y0 = par("usr")[3]-1.5, sig2star(p.adjust(p, method="BH")), cex=0.8, srt=45, pos=3)

 #' ### 3. PCA overview
 #' Now plot a large panel with small multiples of the first two PCs overlaid with the mutation status of each gene.
 #+ PCA-panel,  fig.width=3.5, fig.height=2.8
 par(bty="n", mgp = c(0,0.5,0), mar=c(1,1,1.5,0)+.1, las=1, tcl=-.25, font.main=3, mfrow=c(4,5), xpd=NA)
 i<-0
 for (geneId in colnames(design)[2:20]){
 	i<-i+1
 	plot(pca$x[rownames(design),], cex=0.5, 
 			pch=NA, 
 			xlab=ifelse(i==1,"PC1",""), ylab=ifelse(i==1,"PC2",""),main=geneId, font.main=ifelse(grepl("[[:lower:]]",geneId),1,3), cex.main=1.33, cex.lab=1.2,xaxt="n", yaxt="n", ylim=c(-55,55))
 	if(geneId != "Age")
 		w <- rownames(design)[design[,geneId] == 1]
 	else
 		w <- rownames(design)[which(design0[,geneId] > median(design0[,geneId], na.rm=TRUE))]
 	points(pca$x[!rownames(pca$x) %in% w,], pch=ifelse(is.na(design0[!rownames(pca$x) %in% w,geneId]),1,19), cex=0.5, col="grey", lwd=0.5)
 	points(pca$x[w,], pch=16, cex=0.85, col=colMutations[i], lwd=0.05)
 	u <- matrix(par("usr"), ncol=2)
 	if(i==1){
 		arrows(u[1,1],u[1,2], u[2,1],u[1,2],length=0.02)
 		arrows(u[1,1],u[1,2], u[1,1],u[2,2],length=0.02)
 	}
 	text(u[2],u[4]*.8, labels=paste("n=",length(w), sep=""), bty="n", cex=1.2, pos=2)
 }
 plot.new()
 legend("center",c("Missing","Wildtype","Mutant","Normal","Female","Age>median"), pch=c(1,19,19,19,19,19), col=c("grey","grey","black", colMutations[c("Normal","Gender","Age")]), bty="n", pt.cex=c(0.5,0.5,rep(0.85,4)),cex=1.2, pt.lwd=0.5)


 #' ### 4. Write PCA output
 annot <- select(org.Hs.eg.db, rownames(glm), c("SYMBOL","GENENAME","CHR","CHRLOC"))
 t <- cbind(annot[!duplicated(annot$ENTREZID),1:5], signif(cbind(pca$rotation, 3)))
 write.table(t[order(t[,"PC1"]),1:20], file=paste(Sys.Date(),"-PCA-rotation.txt",sep=""), sep="\t", row.names = FALSE, quote=FALSE)


 #' ### 5. GLM projection
 #' We can project the predicted expression levels for each sample back to the first PCs
 #+ glmProjection,  fig.width=2, fig.height=1.7
 par(bty="n", mgp = c(2,.33,0), mar=c(2,2.5,1,1)+.1, las=1, tcl=-.25, xpd=NA)
 glmProjection <-  t(glm$coefficients[,-1] ) %*% pca$rotation[,1:20]
 for(i in seq(1,19,2)){
 	plot(pca$x[,i + 0:1], col="grey", cex=0.25, pch=19, xlim=range(pca$x[,i])*0.8, ylim=range(pca$x[,i+1])*0.8, xlab="", ylab="")
 	u <- matrix(par("usr"), ncol=2)
 	r <- sqrt(rowSums(glmProjection[,i + 0:1]^2))
 	w <- order(r, decreasing=TRUE)[1:6]
 	arrows(0,0,glmProjection[w,i], glmProjection[w,i+1], col=colMutations[w], pch=19, length = 0.05)
 	text(glmProjection[w,i+0:1] + (glmProjection[w,i+0:1]/r[w]) * rep(diff(u)*c(0.15,0.1)*1.33, each=6), col=colMutations[w], pch=19, rownames(glmProjection)[w], font=ifelse(grepl("[[:upper:]]",rownames(glmProjection)[w]), 3,1))
 	w <- TRUE
 	points(glmProjection[w,i+0:1] , col=paste(colMutations[w],"88", sep=""), pch=16, cex=sqrt(colSums(significantGenes)/100)+.5)
 	mtext(side=1, at=u[2,1], paste("PC",i), line=1, font=2)
 	mtext(side=2, at=u[2,2], paste("PC",i+1), line=.5, font=2)
 }

 #' ### 6. Enrichment of GO Terms in PC1&2
 #' Here we test for an enrichments of GO terms using t-statistics
 #+ GO, cache=TRUE
 k = AnnotationDbi::as.list(org.Hs.egGO2EG)
 k = k[sapply(k, length)>=10]
 c <- pca$rotation[,1:2] *  sqrt(glm$s2.post)
 n <- rownames(pca$rotation)
 t = mclapply(k, function(p){
 			ids = n %in% p
 			sapply(1:ncol(c), function(j){
 						if(sum(ids)>2){
 							t <- t.test(c[ids,j], c[!ids,j], alternative="two.sided")
 							v <- var.test(c[ids,j], c[!ids,j], alternative="greater")
 							c(t$p.value, -diff(t$estimate),
 									v$p.value, v$estimate)
 						}else 
 							rep(NA,4)
 					})
 		}, mc.cores=1)
 GO.pca = matrix(unlist(t), byrow = TRUE, nrow=length(t), dimnames=list(names(t),as.vector(outer(c("pval.t","shift","pval.F","var.odds"),paste(colnames(c), sep=""), paste))))

 Term(names(sort(GO.pca[p.adjust(GO.pca[,1],"BH")<0.1,2])[1:20]))
 Term(names(sort(GO.pca[p.adjust(GO.pca[,1],"BH")<0.1,2], decreasing = TRUE)[1:20]))

 Term(names(sort(GO.pca[p.adjust(GO.pca[,5],"BH")<0.1,6])[1:20]))
 Term(names(sort(GO.pca[p.adjust(GO.pca[,5],"BH")<0.1,6], decreasing=TRUE)[1:20]))

 #+ PCA-GO,  fig.width=2.5, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(3,3,1,1)+.1, tcl=-.25, xpd=NA)
 for(i in 0:1 * 2)
 	plot(GO.pca[GO.pca[,1+i]<0.05,2:1+i], log="y", cex=sqrt(sapply(k[GO.pca[,1+i]<0.05], length)/100), pch=19, col=paste(ifelse(p.adjust(GO.pca[,1+i],"BH")[GO.pca[,1+i]<0.05]<0.1,set1[3],set1[1]),"33", sep=""), lwd=0)

 write.table(data.frame(GO.id=rownames(GO.pca), 
 				GO.Term = Term(names(k)) , 
 				n.genes = sapply(k, length),
 				GO.pca),
 		file=paste(Sys.Date(),"-PCA-GO.txt", sep=""), sep="\t", row.names=FALSE, quote=FALSE)


 #' ### 7. PC rotations
 #' The rotations of the PCs reveal the more influential genes in each component
 x <- pca$rotation[,1] * sqrt(glm$s2.post)
 y <- pca$rotation[,2] * sqrt(glm$s2.post)

 #+ PCA-rotation,  fig.width=3, fig.height=3
 par(bty="n", mgp = c(2,.33,0), mar=c(3,3,2,2)+.1, las=1, tcl=-.25, xpd=NA)
 w <- sqrt(x^2 + y^2) > 0.01
 plot(x[w],y[w], pch=19, lwd=0, col=paste(set1[2],"22", sep=""), xlab="PC1", ylab="PC2")
 points(0,0,pch=19, col=set1[2], cex=8)
 up1 <- names(sort(GO.pca[p.adjust(GO.pca[,1],"BH")<0.1,2], decreasing=TRUE)[1])
 up2 <- names(sort(GO.pca[p.adjust(GO.pca[,5],"BH")<0.1,6], decreasing=TRUE)[1])
 down1 <- names(sort(GO.pca[p.adjust(GO.pca[,1],"BH")<0.1,2])[1])
 down2 <- names(sort(GO.pca[p.adjust(GO.pca[,5],"BH")<0.1,6])[1])
 rug(x[rownames(pca$rotation) %in% org.Hs.egGO2EG[[down1]]], col=set1[1], lwd=1, side=3)
 rug(x[rownames(pca$rotation) %in% AnnotationDbi::mget(up1, org.Hs.egGO2EG)], col=set1[3], lwd=1, side=3)
 rug(y[rownames(pca$rotation) %in% org.Hs.egGO2EG[[down2]]], col=set1[1], side=4, lwd=1)
 rug(y[rownames(pca$rotation) %in% org.Hs.egGO2EG[[up2]]], col=set1[3], side=4, lwd=1)
 w <- rownames(pca$rotation) %in% unlist(AnnotationDbi::mget(c(down1,up1,down2,up2),org.Hs.egGO2EG))
 text(x[w], y[w] ,  unlist(AnnotationDbi::mget(rownames(glm)[w], org.Hs.egSYMBOL)), font=3, cex=0.66)
 mtext(side=3, at=mean(x[rownames(pca$rotation) %in% org.Hs.egGO2EG[[down1]]]), Term(down1), col=set1[1] , line=-1.5)
 mtext(side=3, at=mean(x[rownames(pca$rotation) %in% org.Hs.egGO2EG[[up1]]]), Term(up1), col=set1[3] , line=0.2)
 par(las=3)
 mtext(side=4, at=mean(y[rownames(pca$rotation) %in% org.Hs.egGO2EG[[down2]]]), Term(down2), col=set1[1] , line=-1.5)
 mtext(side=4, at=mean(y[rownames(pca$rotation) %in% org.Hs.egGO2EG[[up2]]]), Term(up2), col=set1[3] , line=0.2)



 #' ### 8. F and GO
 #' Here we test wether any GO category is associated with higher F-statistics, which would indicate a stronger association of these gene sets with our mutations.
 #+ GO-F, cache=TRUE
 k = AnnotationDbi::as.list(org.Hs.egGO2EG)
 k = k[sapply(k, length)>=10]
 n <- rownames(pca$rotation)
 c <- glm$F
 t = sapply(k, function(p){
 			ids = n %in% p
 			if(sum(ids)>2){
 				t <- wilcox.test(c[ids], c[!ids], alternative="greater")
 				c(t$p.value, t$statistic/sum(ids)/sum(!ids))
 			}else 
 				rep(NA,2)
 		})
 GO.F = matrix(t(t), ncol=2, dimnames=list(colnames(t),as.vector(outer(c("pval.U","U"),paste(colnames(c), sep=""), paste))))
 colnames(GO.F) <- c("pval.U","U")
 sum(p.adjust(GO.F[,1],"BH")<0.1)

 Term(names(tail(sort(GO.F[p.adjust(GO.F[,1],"BH")<0.05,2]), 20)))
 Term(names(head(sort(GO.F[p.adjust(GO.F[,1],"BH")<0.05,1]), 20)))

 write.table(data.frame(GO.id=rownames(GO.F), 
 				GO.Term = Term(names(k)) , 
 				n.genes = sapply(k, length),
 				GO.F,
 				adj.P.Val=p.adjust(GO.F[,1], "BH")),
 		file=paste(Sys.Date(),"-GO-F.txt", sep=""), sep="\t", row.names=FALSE, quote=FALSE)

 #' ### 9. PCA stability
 #' Here we assess the stability of the PCA components using subsamples
 #+ PCA-stability, fig.width=3, fig.height=2.5
 set.seed(42)
 s <- seq(20, ncol(geneExpr), 20)
 c <- sapply(s, function(i) {
 			rowMeans(abs(sapply(1:20, function(b){
 						y <-  prcomp(t(geneExpr[,sample(1:ncol(geneExpr),i)]))$rotation[,1:20]; 
 						diag(cor(pca$rotation[,1:20],y))})))
 })
 plot(c(s,176), c(abs(c[1,]),1), type='l', col=set1[1], xlab="Samples", ylab="Abs. correlation with PC", ylim=c(0,1))
 j <- 1
 for(i in c(2,5,10,15,20)){
 	j <- j+1
 	lines(c(s,176), c(abs(c[i,]),1), type='l', col=set1[j])
 }
 legend("bottomright", c("PC1","PC2","PC5","PC10","PC15","PC20"), col= set1[1:6], lty=1, bty="n")


 #' ## 3. Functional annotatition
 #' In this section we will analyse the ENCODE annotation of those genomic regions that change in expression.

 #' ### 1. Load chromHMM data
 #+ chromHMMDownload, cache=TRUE
 tmpf <- tempfile()
 utils::download.file("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmK562HMM.bed.gz", tmpf)
 tmp <- read.table(tmpf, skip=1, sep="\t")
 chromHMMK562 <- GRanges(sub("","",tmp$V1), IRanges(tmp$V2, tmp$V3), class=tmp$V4, col=factor(as.numeric(tmp$V9), labels=sapply(levels(tmp$V9), function(x) do.call("rgb", as.list(c(sapply(strsplit(x,","), as.numeric), maxColorValue = 255))))))
 tmpf <- tempfile()
 utils::download.file("http://hgdownload.cse.ucsc.edu/goldenpath/hg19/encodeDCC/wgEncodeBroadHmm/wgEncodeBroadHmmGm12878HMM.bed.gz", tmpf)
 tmp <- read.table(tmpf, skip=1, sep="\t")
 chromHMMGm12878 <- GRanges(sub("","",tmp$V1), IRanges(tmp$V2, tmp$V3), class=tmp$V4, col=factor(as.numeric(tmp$V9), labels=sapply(levels(tmp$V9), function(x) do.call("rgb", as.list(c(sapply(strsplit(x,","), as.numeric), maxColorValue = 255))))))

 colChromHmm <- as.character(chromHMMK562$col[!duplicated(chromHMMK562$class)])
 names(colChromHmm) <- (unique(chromHMMK562$class))
 colChromHmm <- colChromHmm[order(as.numeric(sub("([0-9]+)_.+","\\1", names(colChromHmm))))]
 colChromHmm[13] <- "#BBBBBB"
 colChromHmm[14:15] <- "#DDDDDD"

 #' ### 2. Target genes as GRanges
 targetGenes <- GRangesList(sapply(colnames(testResults), function(j){
 					tmp <- na.omit(AnnotationDbi::select(org.Hs.eg.db, names(which(testResults[,j]<0)), c("ENTREZID","CHR","CHRLOC", "CHRLOCEND")))
 					GRanges(paste("chr",tmp$CHR,sep=""), shift(IRanges(ifelse(tmp$CHRLOC >0 , tmp$CHRLOC, -tmp$CHRLOCEND), width=10000),-5000), entrezid=tmp$ENTREZID)
 				}))
 targetGenesUp <- GRangesList(sapply(colnames(testResults), function(j){
 					tmp <- na.omit(AnnotationDbi::select(org.Hs.eg.db, names(which(testResults[,j]>0)), c("ENTREZID","CHR","CHRLOC", "CHRLOCEND")))
 					GRanges(paste("chr",tmp$CHR,sep=""), IRanges(abs(tmp$CHRLOC), abs(tmp$CHRLOCEND)), strand=ifelse(tmp$CHRLOC>0,"+","-"), entrezid=tmp$ENTREZID)
 				}))
 targetGenesDown <- GRangesList(sapply(colnames(testResults), function(j){
 					tmp <- na.omit(AnnotationDbi::select(org.Hs.eg.db, names(which(testResults[,j]<0)), c("ENTREZID","CHR","CHRLOC", "CHRLOCEND")))
 					GRanges(paste("chr",tmp$CHR,sep=""), IRanges(abs(tmp$CHRLOC), abs(tmp$CHRLOCEND)), strand=ifelse(tmp$CHRLOC>0,"+","-"), entrezid=tmp$ENTREZID)
 				}))

 sapply(targetGenesUp, function(x){
 			y <- subsetByOverlaps(chromHMMK562,reduce(x),type="within")
 			sapply(split(y, y$class), function(z) sum(width(z)))
 		}) -> chromHmmUp
 sapply(targetGenesDown, function(x){
 			y <- subsetByOverlaps(chromHMMK562,reduce(x),type="within")
 			sapply(split(y, y$class), function(z) sum(width(z)))
 		}) -> chromHmmDown

 tmp <- na.omit(AnnotationDbi::select(org.Hs.eg.db, rownames(geneExpr), c("CHR","CHRLOC","CHRLOCEND","ENTREZID")))
 tmp <- tmp[tmp$CHR != "Un",]
 tss <- sort(GRanges(paste("chr",tmp$CHR,sep=""), shift(IRanges(ifelse(tmp$CHRLOC >0 , tmp$CHRLOC, -tmp$CHRLOCEND), width=1000),-500), entrezid=tmp$ENTREZID))

 t0 <- table(chromHMMK562[chromHMMK562 %over% reduce(tss)]$class)
 o <- order(as.numeric(sub("_.+","",names(t0))))

 #+ GE-chromHMM,  fig.width=3.3, fig.height=2
 par(bty="n", mgp = c(1.7,.33,0), mar=c(2.5,2.7,1,0)+.1, las=2, tcl=-.25)
 plot(c(0, max(b)+1),c(0,0),xlim=c(0.5,33), ylab="Megabases",las=2, ylim=c(-20,15),xlab="", xaxt="n", type="l")
 b <- barplot(chromHmmUp[o,]/1e6, col=colChromHmm, border=0.1,names.arg=rep("", ncol(chromHmmUp)), add=TRUE, yaxt="n")
 barplot(-chromHmmDown[o,]/1e6, col=colChromHmm,add=TRUE, names.arg=rep("", ncol(chromHmmUp)), yaxt="n", border=NA)
 rotatedLabel(x0=b, y0=rep(-20, ncol(chromHmmUp)), labels=colnames(chromHmmDown), cex=.7, srt=45, font=ifelse(grepl("[[:lower:]]", colnames(design))[-1], 1,3), col=colMutations)
 u <- par("usr")
 par(xpd=NA)
 text(u[1]+2,u[4], "upregulated", pos=4, cex=.7)
 text(u[1]+2,u[3]+5, "downregulated", pos=4, cex=.7)
 n <- sub("[0-9]+ ","",gsub("_"," ",rownames(chromHmmDown)[o]))
 legend("right", bty="n", fill=c(NA,colChromHmm[!duplicated(n)]), c("Chromatin state:",unique(n)), cex=.8, border=NA, y.intersp=.8, x.intersp=0.2 )

 #' ### 3. NIH roadmap histone modification data
 #' We need the following bigwig files from GEO
 files <- dir("GSE19465", pattern="GSM48670", full.names = TRUE)
 files

 #' The above files have been downloaded from GEO (http://www.ncbi.nlm.nih.gov/geo/) into the directory `GSE19465` as wiggle tracks and converted into bigwig using wigToBigWig.pl for fast random access. 
 #' The GSM identifiers will help find them.
 #+ roadmap, cache=TRUE
 getTssEnrichment <- function(files, tss){
 	sapply(files, function(f){
 				wig <- import(f,"bigwig", which=tss)
 				o <- findOverlaps(tss, wig)
 				sapply(1:length(tss), function(h)
 							sum(wig[subjectHits(o)[queryHits(o)==h]]$score))
 			})
 }
 tmp <- na.omit(AnnotationDbi::select(org.Hs.eg.db, rownames(geneExpr), c("CHR","CHRLOC","CHRLOCEND","ENTREZID")))
 tmp <- tmp[tmp$CHR != "Un",]
 tss <- sort(GRanges(paste("chr",tmp$CHR,sep=""), shift(IRanges(ifelse(tmp$CHRLOC >0 , tmp$CHRLOC, -tmp$CHRLOCEND), width=1000),-500), entrezid=tmp$ENTREZID)) ## All TSS of mapped genes +/- 500bp
 h3k27 <- list(random = getTssEnrichment(files, tss[sample(length(tss), 1000)]),
 		EZH2 = getTssEnrichment(files, tss[tss$entrezid %in% rownames(glm)[testResults[,"EZH2"]!=0]]),
 		ASXL1 = getTssEnrichment(files, tss[tss$entrezid %in% rownames(glm)[testResults[,"ASXL1"]!=0]]))


 #' Compute enrichment
 h3k27Enrichment <- sapply(h3k27, function(x)
 			log10(rowSums(x[,2:3])/50 + .5)-log10(x[,1]/50+.5))

 #+ PRC-H3K27me3,  fig.width=1.2, fig.height=1.8
 par(bty="n", mgp = c(2,0.5,0), las=1, tcl=-.25, font.main=3, mar=c(3,3,1,0.5)+.1)
 boxplot(h3k27Enrichment, yaxt="n",xaxt="n", boxwex=0.66, outline=FALSE, lty=1, outwex=0, staplewex=0, border=c("grey", colMutations[c("EZH2","ASXL1")]), log="", ylab="H3K27me3 TSS signal", ylim=c(-1,1.5))
 a <- axTicks(side=2, axp=c(0.1,1,3),log=TRUE)
 axis(side=2, at = log10(a), labels=a)
 u <- par("usr")
 rotatedLabel(1:3, rep(-1,3), c("Random", "EZH2", "ASXL1"), font=c(1,3,3))
 p <- c(t.test(h3k27Enrichment$random, h3k27Enrichment$EZH2)$p.value,
 		t.test(h3k27Enrichment$random, h3k27Enrichment$ASXL1)$p.value)
 axis(3, at=c(1,2), labels=NA, tcl=0.25, line=-.5)
 mtext(side=3, at=1.5,  sig2star(p[1]), line=-.85)
 axis(3, at=c(1,3), labels=NA, tcl=0.25, line=0)
 mtext(side=3, at=2,  sig2star(p[2]), line=-.35)


 #+ PRC-Expression,  fig.width=1.4, fig.height=1.8
 par(bty="n", mgp = c(1.75,0.5,0), las=1, tcl=-.25, font.main=3, mar=c(1,2.5,1,0.5)+.1)
 boxplot(glm$coefficients[,1],
 		glm$coefficients[testResults[,"EZH2"]!=0,1],
 		glm$coefficients[testResults[,"EZH2"]!=0,1]+glm$coefficients[testResults[,"EZH2"]!=0,"EZH2"],
 		glm$coefficients[testResults[,"ASXL1"]!=0,1], 
 		glm$coefficients[testResults[,"ASXL1"]!=0,1] + 	glm$coefficients[testResults[,"ASXL1"]!=0,"ASXL1"], 
 		xaxt="n", boxwex=0.66, outline=TRUE, lty=1, outwex=1, staplewex=0, pch=16, cex=.5,
 		border=c("grey", colMutations[rep(c("EZH2","ASXL1"), each=2)]), log="", ylab="Average target expression", ylim=c(1,16))
 segments(2,		glm$coefficients[testResults[,"EZH2"]!=0,1],
 		3, glm$coefficients[testResults[,"EZH2"]!=0,1]+glm$coefficients[testResults[,"EZH2"]!=0,"EZH2"],
 		col=paste(colMutations["EZH2"],"22",sep=""), lwd=.5,
 )
 segments(4,		glm$coefficients[testResults[,"ASXL1"]!=0,1],
 		5, glm$coefficients[testResults[,"ASXL1"]!=0,1]+glm$coefficients[testResults[,"ASXL1"]!=0,"ASXL1"],
 		col=paste(colMutations["ASXL1"],"22",sep=""), lwd=.5,
 )
 u <- par("usr")
 axis(1, at=c(2,3), labels=c("wt","mt"), tcl=-0.25, line=-9, cex=.7)
 axis(1, at=c(4,5), labels=c("wt","mt"), tcl=-0.25, line=-9, cex.lab=.3)
 mtext(side=1,at=c(1, 2.5, 4.5), c("Any","EZH2","ASXL1"), line=-.5, font=c(1,3,3))
 p <- c(t.test(glm$coefficients[,1],glm$coefficients[testResults[,"EZH2"]!=0,1])$p.value,
 		t.test(glm$coefficients[,1],glm$coefficients[testResults[,"ASXL1"]!=0,1])$p.value)
 axis(3, at=c(1,2), labels=NA, tcl=0.25, line=-.5)
 s <- sig2star(p, breaks=10^c(-Inf,-3:0), labels=c("***","**","*",""))
 mtext(side=3, at=1.5,  s[1], line=-.85, col=)
 axis(3, at=c(1,4), labels=NA, tcl=0.25, line=0)
 mtext(side=3, at=3,  s[2], line=-.35)

 #' ### 4. HB expression
 #' The HGA and HGB loci show strong differences associated with mutations in SF3B1 and STAG2
 #+ SF3B1-STAG2-HB,  fig.width=1.4, fig.height=1.8
 par(bty="n", mgp = c(1.5,0.33,0), las=1, tcl=-.25, font.main=3, mar=c(3,2.5,1,3.5)+.1)
 w <- intersect(unlist(AnnotationDbi::mget( grep("^hemoglobin,", Rkeys(org.Hs.egGENENAME), value=TRUE),revmap(org.Hs.egGENENAME))), rownames(glm))
 x <- c(glmProjection[c("SF3B1","Normal","STAG2"),1],0)[c(1,4,2,3)]
 y <- cbind(glm$coefficients[w,c("SF3B1","Normal","STAG2")], 0)[,c(1,4,2,3)] + glm$coefficients[w,"offset"]
 plot(x,y[1,], pch=NA, xlab="", ylab="Predicted expression", ylim=range(y), xaxt="n")
 u <- par("usr")
 axis(side=1, at=x, labels=NA)
 rotatedLabel(x, rep(u[3],4), c("SF3B1","MDS","Normal", "STAG2"), font=c(3,1,1,3), col=c(colMutations["SF3B1"],"#AAAAAA", colMutations[c("Normal","STAG2")]))
 for(i in 1:nrow(y)){

 	lines(x, y[i,], col=set1[1])
 	o <- sqrt(c(glm$stdev.unscaled[w[i],c("SF3B1","Normal","STAG2")],0)[c(1,4,2,3)]^2 + glm$stdev.unscaled[w[i],1]^2)
 	polygon(c(x,rev(x)), c(y[i,] + 2*o, rev(y[i,]) - 2*rev(o)), border=NA, col=paste(set1[1], "22",sep=""))
 }
 par(xpd=NA)
 segments(x[4] + (u[2]-u[1])*.05, y[,4], x[4] + (u[2]-u[1])*.15, rank(y[,4]))
 mtext(side=4, at=rank(y[,4]), text=select(org.Hs.eg.db, w, "SYMBOL")$SYMBOL, font=ifelse(select(org.Hs.eg.db, w, "CHR")$CHR==16, 4,3), las=2, cex=.7, line=1)



 #' ## 4. Clinical regression
 mdsSamples <- mdsData$GEOID[ix]
 subs <- mdsData[ix, grep("SF3B1|TET2|SRSF2|ASXL1|DNMT3A|RUNX1|U2AF1|TP53|EZH2|IDH2|STAG2|ZRSR2|CBL|BCOR|NRAS|JAK2|CUX1|IDH1|KRAS|PHF6|EP300|GATA2|NPM1|MLL2|PTPN11|CREBBP|KIT|MPL|NF1|WT1|IRF1|RAD21|ATRX|CDKN2A|ETV6|KDM6A|CEBPA|FLT3|GNAS|PTEN|SH2B3|BRAF|CTNNA1", colnames(mdsData))] # oncogenic mutations
 subs <- subs[,colSums(subs)>2]

 #' ### 1. Define variable categories
 Z <-  list()
 Z$expression = scale(pca$x[mdsSamples,1:20])
 Z$genetics = scale(subs+0, scale=FALSE)
 Z$cytogenetics = scale(cytoImputed, scale=FALSE) #data[match(rownames(design), mds_clin$PDID),c("chr3","del5q","del7_7q","tri8","del11","del12","alt17q","tri19","del20q","complex")],
 Z$geneticsExpression = cbind(Z$genetics,Z$expression)
 Z$blood = scale(mdsData[ix,c("PB_cytopenia","plt_log","Haemoglobin","bm_blasts_logit","ring_sideroblasts_logit")])
 Z$demographics = scale(mdsData[ix,c( "Gender", "Age")])
 Z$clinical = cbind(Z$blood, Z$demographics)
 Z$geneticsCytogenetics = cbind(Z$genetics, Z$cytogenetics)
 Z$geneticsCytogeneticsExpression = cbind(Z$geneticsCytogenetics, Z$expression)
 Z$expressionClinical = cbind(Z$expression, Z$clinical)
 Z$geneticsClinical = cbind(Z$genetics, Z$clinical)
 Z$geneticsExpressionClinical = cbind(Z$geneticsExpression, Z$clinical)
 Z$all = cbind(Z$geneticsCytogenetics, Z$expression, Z$clinical)

 #' ### 2. Compute LASSO regression
 Y <- cbind(mdsData[ix,c("PB_cytopenia","plt_log","Haemoglobin","bm_blasts_logit","ring_sideroblasts_logit", "Gender", "Age")],
 				(mdsData[ix,c("anc_log","ME_ratio", "Serum_ferritin" )]))
 colnames(Y) <- c("Cytopenia", "Platelets","Haemoglobin","Bonemarrow blasts","Ringed sideroblasts","Gender","Age","Absolute neutrophil count","M:E ratio","Serum ferritin")
 Y <- Y[,order(colnames(Y))]
 X <- scale(Z$geneticsCytogeneticsExpression, scale=TRUE)
 for(j in 1:ncol(X))
 	X[is.na(X[,j]),j] <- mean(X[,j], na.rm=TRUE)
 set.seed(42)
 clinModels = lapply(Y, function(y){
 			if (class(y) %in% c("numeric","integer")){
 				if(all(y %in% c(0,1,NA)))
 					cv.glmnet(X[!is.na(y),], na.omit(y), family = "binomial", alpha=1, standardize=FALSE, nfolds=5)
 				else if(all(y %in% c(0,20,NA)))
 					cv.glmnet(X[!is.na(y),], na.omit(y), family = "poisson", alpha=1, standardize=FALSE, nfolds=5)
 				else
 					cv.glmnet(X[!is.na(y),], na.omit(y), family = "gaussian", alpha=1, standardize=FALSE, nfolds=5)
 			}
 			else if (class(y)=="factor")
 				cv.glmnet(X[!is.na(y),], na.omit(y), family="multinomial",  alpha=1, standardize=FALSE, nfolds=5)
 		})

 #' #### Plot
 #+ Clinical-glmnet,  fig.width=5, fig.height=2, fig.keep="high"
 par(bty="n", mgp = c(2.5,.5,0), mar=c(2.5,4,2,4)+.1, las=2, tcl=-.25)
 i = 1
 n <- colnames(Z$geneticsCytogeneticsExpression)
 annot <- 1 + grepl("^[A-Z]",n) + grepl("PC",n)
 names(annot) <- n
 for(m in clinModels){
 	plotcvnet(m, Z$geneticsCytogeneticsExpression, main=names(clinModels)[i],  col0="black", cex=1, simple.annot = annot, col=set1[c(3,2,4)])
 	i = i+1
 	legend("topright", col=c(set1[c(1,3)],"black")[c(1,3,2)], c(expression(paste("Explained variance ",Rgenetics^2)), expression(paste("Lasso penalty ",lambda)), expression(paste("Model coefficient ", beta))), box.lty=0, bg="#FFFFFF33", pch=c(NA,NA,19), lty=c(1,1,NA), cex=.8, pt.cex = 1)
 }

 #' #### Heatmap of GLMs
 #' Compute a matrix with all coefficients as well as the R2
 j <- 0
 z <- sapply(clinModels,function(x){
 			j <<- j+1
 			w <- which.min(x$cvm)
 			c <- x$glmnet.fit$beta[,w]
 			yj <- sapply(c("genetics","cytogenetics","expression"), function(i){
 						w <- colnames(Z[[i]])
 						X[,w] %*% c[w] 
 					})
 			cj <- rowSums(cov(yj))
 			y <- Y[,j] - x$glmnet.fit$a0[w]
 			covj <- colMeans((y-mean(y))*(yj - rep(colMeans(yj), each=nrow(yj))))
 			r2 <- cj
 			R2 <- 1 - x$cvm[w]/x$cvm[1]
 			c(c, NA,  r2/sum(r2), R2=R2)
 		})
 m <- nrow(z) #ncol(Z$geneticsCytogeneticsExpression)
 z[1:m,] <- pmin(z[1:m,],1)
 z[1:m,] <- pmax(z[1:m,],-.999)
 z[z==0] <- NA

 #' These are the ranks
 r <- sapply(clinModels, function(x) rank(apply(x$glmnet.fit$beta,1, function(y) which(y!=0)[1]), ties="min"))
 s <- sapply(clinModels, function(x) {
 			w = which.min(x$cvm)
 			w <- rev(which(x$cvm[1:w] > x$cvup[w]))[1] +1
 			if(!is.na(w))
 				sum(x$glmnet.fit$beta[,w]!=0)
 			else
 				0
 		})
 p <- sapply(clinModels, function(x) {w <- which.min(x$cvm); (1 - x$cvup[w]/x$cvm[1]) > 0 })

 #' The explained variance R2
 R2 <- z[nrow(z) - 3:0,]
 R2[is.na(R2)] <- 0
 z <- z[-(nrow(z) - 0:4),]
 z <- z[,ncol(z):1]
 r <- r[,ncol(r):1]

 #' Plot
 #+ Clinical-glmnet-heatmap,  fig.width=8, fig.height=2
 layout(matrix(c(1,2,3),1,3), c(6.5,.75,.75), 2, TRUE)
 par(bty="n", mgp = c(3,.5,0), mar=c(4,10,2,0)+.1, las=1, tcl=-.25, cex=1)
 w <- TRUE#rev(c("ring_sideroblasts_logit","bm_blasts_logit","hb", "plt_log", "pb_cytopenia", "age","sex"))
 image(y=1:ncol(z)-.5, x=1:nrow(z), z[,w], breaks=c(-2,seq(-1,1,l=51)), col=c("grey",colorRampPalette(brewer.pal(9,"RdYlBu"))(50)), xaxt="n", yaxt="n", xlab="", ylab="", ylim=c(0,10))
 abline(v=c(18.5,27.5), lwd=0.5)
 rotatedLabel(y0=rep(0.5,nrow(z)), labels=sub("_","/",rownames(z)), x0=1:nrow(z), font=c(rep(3,18),rep(1,nrow(z)-18)), col = set1[c(3,2,4)][annot], cex=0.9)
 mtext(side=2, line=.2, text=colnames(z), las=2, at=1:ncol(z)-.5)
 text(y=rep(1:ncol(z)-.5, each=nrow(r)), x=rep(1:nrow(r), ncol(z)), r[,w] * (0!=(z[1:nrow(r),w])), cex=0.66, font=ifelse(r <= rep(s[ncol(r):1], each=nrow(r)), 2,1))
 points(y=rep(1:ncol(z)-.5, each=nrow(r)), x=rep(1:nrow(r), ncol(z)), pch=ifelse(is.na(z) | z==0, ".",NA))
 mtext(side=1, at=9, "Genetics", col=set1[2], line=2.5 )
 mtext(side=1, at=23, "Cytogenetics", col=set1[3], line=2.5 )
 mtext(side=1, at=38, "Expression", col=set1[4], line=2.5 )
 mtext(side=3, "Model coefficients", at = 14, line=0.5)
 clip(-10,50,0,15)
 image(y=dim(z)[2] + c(0.5,1.5), x=1+ 1:7 , matrix(seq(-0.99,1,l=7), ncol=1), breaks=c(-2,seq(-1,1,l=51)), col=c("grey",colorRampPalette(brewer.pal(9,"RdYlBu"))(50)), xaxt="n", yaxt="n", xlab="", ylab="", add=TRUE)
 text(y=dim(z)[2]+1, x=c(1,9), c(-1,1))
 points(y=11,x=5, pch=".")
 rect(19.5,10.5,20.5,11.5, lwd=0.5)
 text(20,11,1, cex=0.66)
 text(21, 11, "LASSO rank", pos=4)
 u <- par("usr")
 par(bty="n", mgp = c(3,.5,0), mar=c(4,1,2,0)+.1, las=1, tcl=-.25)
 plot(NA,NA, xlab="", ylab="", xaxt="n", yaxt="n", ylim=c(0,10), xlim=c(0,1), yaxs="i")
 barplot(R2[1:3,ncol(R2):1], border=NA, col=paste(set1[c(2,3,4)],"88",sep=""), horiz=TRUE, names.arg=rep(NA,ncol(R2)), width=0.95, space=0.0525, add=TRUE)
 mtext(side=3, "Variance components", line=.5)
 par(bty="n", mgp = c(3,.5,0), mar=c(4,1,2,1)+.1, las=1, tcl=-.25)
 plot(NA,NA, xlab="", ylab="", xaxt="n", yaxt="n", ylim=c(0,10), xlim=c(0,.9), yaxs="i")
 barplot(R2[4,ncol(R2):1], border=NA, col="grey", horiz=TRUE, names.arg=rep(NA,ncol(R2)), width=0.95, space=0.0525, add=TRUE) -> b
 points(R2[4,ncol(R2):1]+0.1,b, pch=ifelse(rev(p),"*",NA))
 mtext(side=1, expression(paste("Explained variance ",R^2)), line=2.5)


 #' #### Prediction of ringed sideroblasts
 #' As an example of the LASSO fits, show the prediction vs. observations of the proportion of ringed sideroblasts.
 #+ Clinical-glmnet-RS,  fig.width=2, fig.height=1.8
 par(bty="n", mgp = c(1.5,.33,0), mar=c(2.5,3.5,2,1)+.1, las=1, tcl=-.25)
 w <- which(clinModels$`Ringed sideroblasts`$glmnet.fit$beta[,which.min(clinModels$`Ringed sideroblasts`$cvm)]!=0)
 X <- scale(Z$geneticsCytogeneticsExpression, scale=FALSE)
 for(j in 1:ncol(X))
 	X[is.na(X[,j]),j] <- mean(X[,j], na.rm=TRUE)
 set.seed(42)
 l <- lm(Y$`Ringed sideroblasts` ~ X[,w])
 y <- na.omit(Y$`Ringed sideroblasts`)
 x <- predict(l)
 invlogit <- function(x) 1/(1+exp(-x))
 plot(x, y, xaxt="n", yaxt="n", pch=16, cex=0.8, xlab="", ylab="", xlim=car::logit(c(0.01,0.95)), ylim=car::logit(c(0.025,0.95)))
 title(xlab="Predicted ringed sideroblasts")
 title(ylab="Observed ringed sideroblasts", line=2.5)
 abline(0,1)
 u <- par("usr")
 par(xpd=NA)
 y <- l$coefficients[-1]+ l$coefficients[1]
 names(y) <- colnames(X)[w]
 u <- par("usr")
 x0 <- rep(u[4]-0.5,length(y))
 names(x0) <- names(y)
 y0 <- u[4] + 0.05*(u[4]-u[3]) - rank(-y)/length(y) * (u[4]-u[3])/1.2
 d <- density(y)
 lines(d$x, d$y/2-0.5 +u[4], col="grey")
 lines(d$x, -d$y/2-0.5+u[4], col="grey")
 points(x=y, y=x0+violinJitter(y, magnitude=1)$y, pch=16, col=set1[c(3,2,4)][annot][w]) #annot[w]/10-0.2
 w <- r[w,"Ringed sideroblasts"]
 w <- "SF3B1"#names(w[w<=5])
 rotatedLabel(y[w], x0[w], labels=(w), font=ifelse(grepl("PC", w),1,3), cex=.66, pos=3, col=set1[c(3,2,4)][annot[(w)]])
 axis(at=-1:1 + l$coefficients[1], labels=-1:1, side=3, cex.axis=.8)
 mtext(at=l$coefficients[1], line=1, side=3, "Coefficients", cex=.8)
 text(u[2],u[3]+1, expression(paste(Rgenetics^2==0.55)), pos=2)
 par(xpd=FALSE)
 car::probabilityAxis(side="left", axis.title = "")
 car::probabilityAxis(side="below", axis.title = "")




 #' ## 5. Survival analysis
 #' Here we analyse the influence of mutations, expression and blood counts on survival
 #' ### 1. Prepare data
 amlFreeSurvival <- Surv(time=mdsData$Survival_days, event=mdsData$Status)
 amlFreeSurvival[!is.na(mdsData$AML_progression_days)] <- Surv(time=mdsData$AML_progression_days, event=mdsData$AML_status)[!is.na(mdsData$AML_progression_days)]
 amlFreeSurvival[mdsData$AML_progression_days < 0 | mdsData$Survival_days < 0] <- NA
 amlFreeSurvival <- amlFreeSurvival[ix]
 amlFreeSurvival[,1] <- amlFreeSurvival[,1] /365*12 # Convert to months

 #' #### Model
 #' We use the following variant of the Cox proportional hazards model to estimate the survial models
 ecoxph <- function(X, surv, tol=1e-3, max.iter=50){
 	if(class(X)=="data.frame")
 		X = as.matrix(X)
 	beta0 = rep(0,ncol(X))
 	beta1 = rep(1,ncol(X))
 	sigma2 = 1
 	iter = 1
 	while(max(abs(beta1-beta0))>tol& iter < max.iter){
 		fit = coxph(surv ~ ridge(X, theta=1/sigma2, scale=FALSE))
 		sigma2 = (1 + sum((fit$coefficients-mean(fit$coefficients))^2))/(ncol(X))	
 		beta0 = beta1
 		beta1 = fit$coefficients
 		#cat(beta1,"\n")
 		#cat(sigma,"\n")
 		iter = iter+1
 	}
 	fit$sigma2 = sigma2
 	names(fit$coefficients) = colnames(X)
 	return(fit)
 }
 #' This model uses a Gaussian prior on the coefficients. The variance of the prior is estimate by empirical Bayes.
 #' The shared prior introduces a ridge penalty on the parameters, which help stabilise the estimates.

 #' #### Impute missing Z by mean..
 for(i in names(Z)){
 	for(j in 1:ncol(Z[[i]])){
 		Z[[i]][is.na(Z[[i]][,j]),j] <- mean(Z[[i]][,j], na.rm=TRUE)
 	}
 }

 #' #### Concordance of models based on different covariates
 #' Use five-fold cross validation
 set.seed(42)
 concordanceCV = data.frame()
 cv_ix = sample(1:5, length(mdsSamples), replace=TRUE)
 for(i in unique(cv_ix)){
 	c = lapply(Z, function(x) ecoxph(x[cv_ix!=i,], amlFreeSurvival[cv_ix!=i] ))
 	p = mapply(function(x,y) as.matrix(x[cv_ix==i,]) %*% coef(y), Z, c)
 	concordanceCV = rbind(concordanceCV, apply(-p,2, rcorr.cens, amlFreeSurvival[cv_ix==i])[1,])
 }
 colnames(concordanceCV) = names(Z)
 colMeans(concordanceCV)

 ipss <- mdsData$ipss[ix]
 hIpss <- rcorr.cens(-ipss,amlFreeSurvival)[1]

 #' Barplots
 #+ GE-H-barplots,  fig.width=2, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(5,3,1,1)+.1, las=1, tcl=-.25, xpd=NA)
 h <- concordanceCV[,c("genetics","cytogenetics","expression","blood","demographics","all")]
 v <- sapply(h, sd)/sqrt(5)
 colnames(h) <- ""
 b <- barplot(c(colMeans(h),hIpss), border=NA, col=paste(c(set1[c(2,3,4,1,5,7)],"#BBBBBB"),"88", sep=""), las=2, ylim=c(0,0.8), ylab="Harrel's C", names.arg=rep("",7))
 points(rep(b[-7], each=5) + seq(-.1,.1,l=5), unlist(h), pch=16, cex=.5, col="darkgrey")
 segments(x0=b[-7], y0=colMeans(h)-v, y1=colMeans(h)+v)
 rotatedLabel(b, rep(0,7), c("Genetics","Cytogenetics","Expression","Blood counts","Demographics","All","IPSS"))

 #' Kaplan-Meier plots of risk terciles
 #+ GE-KM,  fig.width=2.1, fig.height=1.8,  fig.show='asis'
 par(bty="n", mgp = c(1.9,.33,0), mar=c(2.3,3,2,2.1)+.1, las=1, tcl=-.25)
 j <- 0
 for(x in Z){
 	j <- j +1
 	c = sapply(1:5, function(i) coef(ecoxph(x[cv_ix!=i,],amlFreeSurvival[cv_ix!=i])))
 	r = as.matrix(x) %*% rowMeans(c) 
 	plot(survfit(amlFreeSurvival ~ cut(r, quantile(r+runif(length(r), -0.01,0.01), seq(0,1,l=4), na.rm=TRUE), right=FALSE)), col = set1[3:1], xlab="", ylab="AML-free survival")
 	title(xlab="Months", line=1.5)
 	mtext(capitalize(names(Z)[j]), font=2, at=40, side=3, line=0.5)
 	par(xpd=TRUE)
 	legend(x=60, y=1.2, bty="n", "Risk", cex=0.9, text.font=2)
 	legend(x=60, y=1.1, bty="n", text.col = set1[3:1], c("Low","Intermediate","High"), cex=0.9)
 	text(x=0, y=0.05, paste("C =", round(colMeans(concordanceCV)[j],2)), cex=1, pos=4)
 }
 	


 #' ### 2. Risk contributions
 #' Covariates once more
 X <-  data.frame(
 		Genetics = Z$genetics,
 		Cytogenetics = Z$cytogenetics,
 		Expression= Z$expression,
 		`Blood counts` = Z$blood,
 		Demographics =  Z$demographics
 )

 #' #### Fit single model
 groups <- sub("\\..+","",colnames(X))
 model <- ecoxph(X, amlFreeSurvival)
 index <- TRUE
 r <- sapply(unique(groups[index]), function(x) {
 			ix <- groups[index] == x
 			as.matrix(X[,index][,ix, drop=FALSE]) %*% coef(model)[ix] #+ fullModel$sumX[,x] * fullModel$mu[x]
 		})

 c <- cov(r, use="complete")
 x <-  colSums(c)/sum(c)#diag(c / sum(diag(c))) #
 x <- x - sum(x[x < 0])
 col0 <- c(paste(set1, "88", sep="")[c(3,2,4,1,5)], "grey")
 pie(x, col=col0, border=NA, labels = paste(names(x), " (",round(100*x),"%)", sep=""))
 polygon(cos(seq(0,2*pi,l=100))*.5, sin(seq(0,2*pi,l=100))*.5, col="white", border=NA)

 C <- rcorr.cens(-rowSums(r), amlFreeSurvival)[1]
 polygon(c(sin(seq(0,2*pi *C,l=100) +(1-C)*pi)*.4,0), c(cos(seq(0,2*pi *C,l=100)+(1-C)*pi)*.4,0), col="grey", border=NA)
 text(0,-0.1, paste("C=",round(C,2), sep=""), col="white", pos=1)

 #' #### Five-fold cross validation
 #' It may be better to use cross-validation, especially for the estimate of concordance
 C <- NULL
 y <- NULL
 set.seed(42)
 cv_ix = sample(1:5, length(mdsSamples), replace=TRUE)

 for(i in 1:5){
 	fit = ecoxph(X[cv_ix!=i,index],amlFreeSurvival[cv_ix!=i], tol=1e-6)
 	c = coef(fit)
 	C = c(C,rcorr.cens(-as.matrix(X[cv_ix==i,index][,groups!="Nuisance"]) %*% c[groups!="Nuisance"] , amlFreeSurvival[cv_ix==i])[1])
 	r <- sapply(unique(groups[index]), function(x) {
 				ix <- groups[index] == x
 				as.matrix(X[cv_ix!=i,index][,ix, drop=FALSE]) %*% coef(fit)[ix] #+ fullModel$sumX[,x] * fullModel$mu[x]
 			})
 	
 	c <- cov(r, use="complete")
 	x <-  colSums(c)/sum(c)#diag(c / sum(diag(c))) #
 	y <- rbind(y,x)
 }
 meanC <- mean(C)
 x <- colMeans(y)
 x <- colSums(c)/sum(c)
 x <- x - sum(x[x<0])
 pi <- base::pi

 #+ Cox-Variance,  fig.width=2.2, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(1,3,1,3)+.1, las=2, tcl=-.25)
 pie(x, col=col0, border=NA, labels = paste(names(x), " (",round(100*x),"%)", sep=""), radius=.8, init.angle=179)
 title(main="Survival risk contributions", font.main=1, cex.main=1)
 polygon(cos(seq(0,2*pi,l=100))*.5, sin(seq(0,2*pi,l=100))*.5, col="white", border=NA)
 polygon(c(sin(seq(0,2*pi *meanC,l=100) +(1-meanC)*pi)*.4,0), c(cos(seq(0,2*pi *meanC,l=100)+(1-meanC)*pi)*.4,0), col="grey", border=NA)
 text(0,0, paste("C=",round(meanC,2), sep=""), col="white", pos=1)


 #' ### 3. Random Survival Forest
 #' Here we compare the results of the previous section to a random survival forest
 #' #### Fit model
 rsf <- rfsrc(Surv(time,status) ~ ., data=cbind(time=amlFreeSurvival[,1], status=amlFreeSurvival[,2], X) )
 col=set1[c(3,2,4,1,5)]#  c(brewer.pal(8, "Dark2")[1:3], brewer.pal(8, "Set1")[2:1])

 #' #### Variable importance
 #+ RSF-boxplot,  fig.width=1.5, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(4,3,1,0.5)+.1, las=2, tcl=-.25, las=3, xpd=NA)
 boxplot(rsf$importance ~ factor(groups, levels=unique(groups)), border= col, staplewex=0, pch=16, cex=0.75, ylab="Variable importance", lty=1, xaxt="n")
 rotatedLabel(x0=1:5, y0=rep(-0.002,5), labels=unique(groups), srt=45)
 #' The plot confirms the result that expression, blood counts, and clinical variables are most influental.

 #' Another plot
 #+ RSF-importance,  fig.width=6, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(3,3,1,0.5)+.1, las=2, tcl=-.25, las=3, lend=1, xpd=NA)
 o <- order(rsf$importance, decreasing = TRUE)
 plot(rsf$importance[o], type="h", col=col[factor(groups, levels=unique(groups))[o]], ylab="Variable importance", xaxt="n", xlab="", lwd=7)
 legend("topright", col=col, lty=1, unique(groups), bty="n", lwd=3)

 #+ RSF-variables,  fig.width=4, fig.height=3, warning=FALSE
 par(bty="n", mgp = c(2,.33,0), mar=c(1,3,1,3)+.1, las=2, tcl=-.25)
 plot.variable(rsf)

 #+ RSF,  fig.width=6, fig.height=6, warning=FALSE
 par(cex=0.75)
 plot(rsf)

 #' ## Session
 sessionInfo()
diff --git a/ncomms6901-s5.R b/ncomms6901-s5.R
 #' Supplementary Data 4
 #' =======================
 #' ### Supplementary code and figures accompanying *Interconnections among mutations, gene expression, clinical variables and patient outcome in myelodysplastic syndromes*
 #' 
 #' This document contains the complete code used in the TCGA AML analysis. It is purely written in `R` using a series of `R` and `Bioconductor` packages. 
 #' This report has been generated using the `knitr` R package (http://yihui.name/knitr/). 
 #' For a complete list of packages and their versions please have a look at the end of this document.

 #+ Preliminaries, echo=FALSE
 options(width=120)
 pdf.options(pointsize=8)
 knit_hooks$set(smallMar = function(before, options, envir) {
 			if (before) par(mar=c(3,3,1,1), bty="n", mgp=c(2,.5,0)) 
 		})
 opts_chunk$set(dev=c('my_png','pdf'), fig.ext=c('png','pdf'), fig.width=3, fig.height=3, smallMar=TRUE)
 my_png <-  function(file, width, height, pointsize=12, ...) {
 	png(file, width = 1.5*width, height = 1.5*height, units="in", res=72*1.5, pointsize=pointsize, ...)
 }
 #opts_knit$set(root.dir = file.path(getwd(),".."))

 #' TCGA AML expression data analysis
 #' ===========================

 #' #### Libraries
 library(limma)
 library(org.Hs.eg.db)
 library(hgu133plus2.db )
 library(RColorBrewer)
 set1 <- brewer.pal(9,"Set1")
 library(cgdsr)
 library(CoxHD) ## From github.com/mg14/CoxHD
 library(mg14) ## From github.com/mg14/mg14
 library(xtable)
 library(Hmisc)

 #' 1. Differential expression analysis
 #' ----------------------
 #' 
 #' ### 1.1 Load data 
 #' #### Get entrez ids for genes of interest, take all ids from the hg133plus array to have comparable data.
 entrez <-  unique(AnnotationDbi::select(hgu133plus2.db, keys = keys(hgu133plus2.db), columns = c("ENTREZID"))$ENTREZID)

 #' #### Load expression data from cBio portal
 #+ cBioData, cache=TRUE
 mycgds <-  CGDS("http://www.cbioportal.org/public-portal/")
 tcgaAML <- getCancerStudies(mycgds)[1,1]
 cases <- getCaseLists(mycgds,tcgaAML)[8,1]
 g <-  lapply(split(as.numeric(entrez), seq_along(entrez)%/%500), function(genes) getProfileData(mycgds,genes,getGeneticProfiles(mycgds,tcgaAML)[2,1],cases)) ## load in batches of 500
 g <- do.call("cbind", g)
 tcgaExpr <- log(t(g)+.5)
 rownames(tcgaExpr) <- sub("\\.","-",sub("^[0-9]+\\.","",rownames(tcgaExpr))) ## Fix rownames
 #rm(e,s,g)

 #' The query will have dropped a few genes. Match back to entrezids...
 s <- AnnotationDbi::select(org.Hs.eg.db, rownames(tcgaExpr), "ENTREZID", "SYMBOL")
 m <- match(rownames(tcgaExpr), s$SYMBOL)
 sum(is.na(m))
 rownames(tcgaExpr) <- s$ENTREZID[m]
 colnames(tcgaExpr) <- gsub("\\.","-",colnames(tcgaExpr))
 tcgaExpr <- tcgaExpr[rowSums(is.na(tcgaExpr))==0 & !is.na(rownames(tcgaExpr)),]

 #' Check the log2 FPKM densities across samples
 #+ TCGA-logFPKM, fig.width=2, fig.height=1.8
 plot(density(tcgaExpr[,1]))
 for(j in 1:ncol(tcgaExpr)) lines(density(tcgaExpr[,j]))

 #' #### Load curated clinical and mutation data
 tcgaClinical <- read.table("SuppTableS3_Clinical.txt", sep="\t", header=TRUE, comment.char = "#") ## tab-delimietd version of Supplementary Table S3, Clinical sheet.
 colnames(tcgaClinical)[3:12] <- capitalize(colnames(tcgaClinical)[3:12])
 rownames(tcgaClinical) <- as.character(tcgaClinical$TCGA_ID)
 tcgaGenetic <- read.table("SuppTableS3_Genetic.txt", sep="\t", header=TRUE, comment.char = "#") ## tab-delimietd version of Supplementary Table S3, Genetics sheet.
 tcgaGenetic$TCGA_ID <- factor(as.character(tcgaGenetic$TCGA_ID), levels = levels(tcgaClinical$TCGA_ID))
 g <- as.character(tcgaGenetic$Hugo_Symbol)
 g[tcgaGenetic$Hugo_Symbol=="FLT3" & tcgaGenetic$Variant_Type == 'INS'] <- "FLT3_ITD"
 g[tcgaGenetic$Hugo_Symbol=="FLT3" & tcgaGenetic$Variant_Type == 'SNP'] <- "FLT3_TKD"
 tcgaMutation <- (table(tcgaGenetic$TCGA_ID,g)) > 0

 #' ### 1.2 Compute PCA
 tcgaPca <- prcomp(t(tcgaExpr))

 #' #### Covariates
 t <- cbind(tcgaMutation +0, tcgaClinical[,14:24])
 t <- t[,colSums(t[colnames(tcgaExpr),],na.rm=TRUE)>=5]
 tcgaCovariates <- as.matrix(cbind(Offset=1,t, Gender=tcgaClinical$Gender, Age=tcgaClinical$AOD/10))[colnames(tcgaExpr),]
 groups <- factor(c("Offset", rep("Genetics",22), "Translocations",rep("CNA", 5), rep("Translocations",2), rep("Demographics",2)), levels=c("Offset","Genetics","CNA","Translocations","Demographics"))
 tcgaCovariates <- tcgaCovariates[,order(groups)]
 groups <- groups[order(groups)]
 col1 <- c("grey",set1[c(3,5,2,7)])
 names(col1) <- levels(groups)


 #' #### PCA overview
 #' Now plot a large panel with small multiples of the first two PCs overlaid with the mutation status of each gene.
 #+ TCGA-PCA-panel,  fig.width=5, fig.height=4
 par(bty="n", mgp = c(0,0.5,0), mar=c(1,1,1.5,0)+.1, las=1, tcl=-.25, font.main=3, mfrow=c(6,6), xpd=NA)
 i<-0
 for (geneId in colnames(tcgaCovariates)[-1]){
 	i<-i+1
 	plot(tcgaPca$x[rownames(tcgaCovariates),], cex=0.5, 
 			pch=NA, 
 			xlab=ifelse(i==1,"PC1",""), ylab=ifelse(i==1,"PC2",""),main=geneId, font.main=ifelse(grepl("[[:lower:]]",geneId),1,3), cex.main=1.33, cex.lab=1.2,xaxt="n", yaxt="n", ylim=c(-65,65))
 	if(geneId != "Age")
 		w <- rownames(tcgaCovariates)[which(tcgaCovariates[,geneId] == 1)]
 	else
 		w <- rownames(tcgaCovariates)[which(tcgaCovariates[,geneId] > median(tcgaCovariates[,geneId], na.rm=TRUE))]
 	points(tcgaPca$x[!rownames(tcgaPca$x) %in% w,], pch=ifelse(is.na(tcgaCovariates[!rownames(tcgaPca$x) %in% w,geneId]),1,19), cex=0.5, col="grey", lwd=0.5)
 	points(tcgaPca$x[w,], pch=16, cex=0.85, col=col1[groups[i+1]], lwd=0.05)
 	u <- matrix(par("usr"), ncol=2)
 	if(i==1){
 		arrows(u[1,1],u[1,2], u[2,1],u[1,2],length=0.02)
 		arrows(u[1,1],u[1,2], u[1,1],u[2,2],length=0.02)
 	}
 	text(u[2],u[3] + (u[4]-u[3])*.9, labels=paste("n=",length(w), sep=""), bty="n", cex=1.2, pos=2)
 }
 plot.new()
 legend("center",c("Missing","Wildtype","Mutant"), pch=c(1,19,19), col=c("grey","grey","black"), bty="n", pt.cex=c(0.5,0.5,rep(0.85,4)),cex=1.2, pt.lwd=0.5)


 #' ### 1.2 Explained variance R2/F-statistic
 #' Here we want to determine all genes which are associated with any covariate. This will be based on an F-statistic.
 #' lmFit also tests wether the offset is different from zero (trivially true).
 #' Create design matrix of covariates; impute missing values by mean. 
 poorMansImpute <- function(x) {x[is.na(x)] <- mean(x, na.rm=TRUE); return(x)}
 tcgaDesign <- apply(tcgaCovariates,2,poorMansImpute)

 #' #### Fit the linear model
 tcgaGlm = lmFit(as.matrix(tcgaExpr), design = tcgaDesign) 
 tcgaGlm = eBayes(tcgaGlm)

 #' #### Random model
 #' Compare to a model where all values of the covariates are randomly permuted. This helps reassure FDR estimates.
 set.seed(42)
 tcgaRlm <- lmFit(tcgaExpr[,rownames(tcgaDesign)], apply(tcgaDesign, 2, sample))
 tcgaRlm <- eBayes(tcgaRlm)
 F.stat <- classifyTestsF(tcgaRlm[,-1],fstat.only=TRUE)
 tcgaRlm$F <- as.vector(F.stat)
 df1 <- attr(F.stat,"df1")
 df2 <- attr(F.stat,"df2")
 if(df2[1] > 1e6){ # Work around bug in R 2.1
 	tcgaRlm$F.p.value <- pchisq(df1*tcgaRlm$F,df1,lower.tail=FALSE)
 }else
 	tcgaRlm$F.p.value <- pf(tcgaRlm$F,df1,df2,lower.tail=FALSE)

 #' #### Explained variance by different categories
 #' The F-statistic is directly related to the R2.
 F.stat <- classifyTestsF(tcgaGlm[,2:22],fstat.only=TRUE) ## All genetics & cytogenetics
 df1 <- attr(F.stat,"df1")
 df2 <- attr(F.stat,"df2")
 F.p.value <- pchisq(df1*F.stat,df1,lower.tail=FALSE)

 R.stat <- classifyTestsF(tcgaRlm[,2:22],fstat.only=TRUE) ## Random

 Rall = 1 - 1/(1 + tcgaGlm$F * (ncol(tcgaDesign)-1)/(nrow(tcgaDesign)-ncol(tcgaDesign)))
 Rgenetics = 1 - 1/(1 + F.stat * 21/(nrow(tcgaDesign)-ncol(tcgaDesign)))
 Pgenetics = 1 - 1/(1 + R.stat * 21/(nrow(tcgaDesign)-ncol(tcgaDesign)))
 names(Rgenetics) <- names(Pgenetics) <- names(Rall) <-  rownames(tcgaExpr)

 #' #### Plot the variance explained by genetics
 #+ TCGA-GE-R2, fig.width=2, fig.height=1.8
 par(bty="n", mgp = c(2,.33,0), mar=c(3,2.5,1,1)+.1, las=1, tcl=-.25, xpd=NA)
 d <- density(Pgenetics,bw=1e-3)
 f <- 1#nrow(gexpr)/512
 plot(d$x, d$y * f, col='grey', xlab=expression(paste("Explained variance per gene ", R^2)), main="", lwd=2, type="l", ylab="", xlim=c(0,0.7))
 title(ylab="Density", line=1.5)
 d <- density(Rgenetics, bw=1e-3)
 r <- min(Rgenetics[p.adjust(F.p.value,"BH")<0.05])
 x0 <- which(d$x>r)
 polygon(d$x[c(x0[1],x0)], c(0,d$y[x0])* f, col=paste(set1[1],"44",sep=""), border=NA)
 lines(d$x, d$y* f, col=set1[1], lwd=2)
 #points(d$x[x0[1]], d$y[x0[1]]*f, col=set1[1], pch=16)
 text(d$x[x0[1]], d$y[x0[1]]*f, pos=4, paste(sum(Rgenetics > r), "genes q < 0.05"))
 legend("topright", bty="n", col=c(set1[1], "grey"), lty=1, c("Observed","Random"), lwd=2)

 #' #### Predictions
 glmPrediction <- tcgaGlm$coefficients %*% t(tcgaDesign)
 rlmPrediction <- tcgaRlm$coefficients %*% t(tcgaDesign)

 #' ### 1.4. Test results
 #' Prepare the test results using a hierarchical procedure: 
 #' 1. Adjust down transcripts using F-stat
 #' 2. Adjust along covariates
 #+ TCGA-testResults
 testResults <- decideTests(tcgaGlm, method="hierarchical",adjust.method="BH", p.value=0.05)[,-1]
 significantGenes <- sapply(1:ncol(testResults), function(j){
 			c <- tcgaGlm$coefficients[testResults[,j]!=0,j+1]
 			table(cut(c, breaks=c(-5,seq(-1.5,1.5,l=7),5)))
 		})
 colnames(significantGenes) <- colnames(testResults)


 #' #### Table of top 50 deregulated genes contains many HOX genes
 #+ TCGA-top50, results='asis'
 t <- head(sort(Rgenetics, d=TRUE), 50)
 g <- apply(testResults[names(t),], 1, function(x) paste(colnames(testResults)[x!=0], collapse=", "))
 print(xtable(cbind(AnnotationDbi::select(org.Hs.eg.db, names(t), c("SYMBOL","GENENAME"), "ENTREZID"),`R^2`=t, `Mutations`=g)), type="html", )

 #' #### Top genes
 #+ TCGA-GE-predict, fig.width=2, fig.height=1.8
 par(bty="n", mgp = c(1.5,.33,0), mar=c(2.5,2.5,1,1)+.1, las=1, tcl=-.25)
 for(w in names(head(sort(Rgenetics, decreasing=TRUE),4))){
 gene <- AnnotationDbi::select(org.Hs.eg.db, w, "SYMBOL", "ENTREZID")$SYMBOL
 plot(glmPrediction[w,], tcgaExpr[w,rownames(tcgaDesign)], ylab=parse(text=paste("Observed  ~ italic(",gene,") ~ expression")), xlab=parse(text=paste("Predicted  ~ italic(",gene,") ~ expression")), pch=16, cex=.8)
 par(xpd=FALSE)
 abline(0,1)
 u <- par("usr")
 par(xpd=NA)
 y <- tcgaGlm$coefficients[w,-1]+tcgaGlm$coefficients[w,1]
 u <- par("usr")
 x0 <- rep(u[4]-(u[4]-u[3])/8,ncol(tcgaDesign)-1)
 y0 <- u[4] + 0.05*(u[4]-u[3]) - rank(-y)/length(y) * (u[4]-u[3])/1.2
 d <- density(y)
 lines(d$x, d$y/5+u[4]-(u[4]-u[3])/8, col="grey")
 lines(d$x, -d$y/5+u[4]-(u[4]-u[3])/8, col="grey")
 points(x=y, y=x0+violinJitter(y, magnitude=0.25)$y,pch=19, col=col1[groups[-1]])
 text(x=tcgaGlm$coefficients[w,1], y= u[4], "Model coefficients (logFC)", cex=0.8)
 v <- tcgaGlm$p.value[w,-1] < 0.01
 rotatedLabel(y[v], x0[v]+0.1, labels=colnames(tcgaDesign)[-1][v], font=ifelse(grepl("[[:lower:]]", colnames(tcgaDesign)[-1]),1,3)[v], cex=.66, pos=1)
 axis(at=-1:1 + tcgaGlm$coefficients[w,1], labels=-1:1, side=3, cex.axis=.8, line=-1, mgp = c(1.5,.05,0), tcl=-.15)
 text(u[2],u[3] + (u[4]-u[3])/10, substitute(paste(R^2==r),list(r=round(Rgenetics[w],2))), pos=2)
 }


 #' #### Barplot
 #+ TCGA-GE-quantiles,  fig.width=4, fig.height=2
 par(bty="n", mgp = c(2.5,.33,0), mar=c(3.5,3.3,2,0)+.1, las=2, tcl=-.25)
 b <- barplot(significantGenes, las=2, ylab = "Differentially expressed genes", col=brewer.pal(8,"RdYlBu"), legend.text=FALSE , border=0, xaxt="n")#, col = set1[simple.annot[names(n)]], border=NA)
 rotatedLabel(x0=b, y0=rep(10, ncol(significantGenes)), labels=colnames(significantGenes), cex=.7, srt=45, font=ifelse(grepl("[[:lower:]]", colnames(tcgaDesign))[-1], 1,3), col=col1[groups[-1]])
 #text(b+0.2, colSums(n)+50, colSums(n), pos=3, cex=.7, srt=90)
 x0 <-par("usr")[1] + 0.05 * (par("usr")[2]-par("usr")[1])
 dy <- 0.1*(par("usr")[4] - par("usr")[3])
 y0 <- par("usr")[4] - 1.5*dy
 image(x=x0+c(0,0.8), y=y0+seq(-dy,dy,l=9), z=matrix(1:8, ncol=8), col=brewer.pal(8,"RdYlBu"), add=TRUE)
 text(x=x0+1.5, y=y0+seq(-dy/2,dy/2,l=3), format(seq(-1,1,l=3),2), cex=0.66)
 lines(x=rep(x0+.8,2), y=y0+c(-dy*.75,dy*.75))
 segments(x0+.8,y0+seq(-dy*.75,dy*.75,l=7),x0+.9,y0+seq(-dy*.75,dy*.75,l=7))
 text(x0+.8, y0+dy*1.1, "log2 FC", cex=.66)
 rotatedLabel(b-0.1, colSums(significantGenes), colSums(significantGenes), pos=3, cex=, srt=45)

 #' #### Associated mutations per transcript:
 #+ TCGA-GE-transcripts,  fig.width=2, fig.height=2
 par(bty="n", mgp = c(2.5,.33,0), mar=c(3,3.3,3,0)+.1, las=1, tcl=-.25)
 t <- table(rowSums(abs(testResults[,1:16])))
 b <- barplot(t[-1],ylab="Differentially expressed genes", col=rev(brewer.pal(7, "Spectral")[-(4:5)]), border=NA)
 rotatedLabel(b-0.1, t[-1], t[-1], pos=3, cex=1, srt=45)
 title(xlab="Associated drivers", line=2)


 #' #### Chromosomal distribution
 chr = factor(sapply(AnnotationDbi::mget(rownames(tcgaExpr), org.Hs.egCHR, ifnotfound=NA), `[`,1), levels=c(1:22, "X","Y","MT"))
 chromTable <- apply(testResults,2, function(x) table(chr[x!=0]))

 #+ TCGA-GE-CHR,  fig.width=6, fig.height=6
 par(bty="n", mgp = c(0.5,0.5,0), las=1, tcl=-.25, font.main=3, mfrow=c(6,6), xpd=NA, mar=c(0,0,1.5,0))
 for(j in 1:ncol(testResults)){
 	n <- sum(testResults[,j]!=0)
 	pie(chromTable[,j], col=colorRampPalette(brewer.pal(11,'Spectral'))(24), border="white",  radius=0.8, init.angle=90, labels=ifelse(chromTable[,j]/sum(chromTable[,j]) > 0.02, paste("",rownames(chromTable), "(" ,chromTable[,j], ")",sep=""),""))
 	title(main = colnames(chromTable)[j], font.main = ifelse(grepl("[[:lower:]]", colnames(chromTable)[j]), 1,3), cex.main=1.33)
 	symbols(0,0,circles=.3, inches=FALSE, col="white", bg="white", lty=0, add=TRUE)
 	#symbols(0,0,circles=.8*(1-sqrt(n/max(colSums(testResults!=0)))), col="white", add=TRUE, lty=0, bg="white", inches=FALSE)
 	#cat(n,"\n")
 }
 t <- table(chr)
 pie(t, col=colorRampPalette(brewer.pal(11,'Spectral'))(24), border="white",  radius=0.8, cex.main=1.33, labels=ifelse(t/sum(t) > 0.02, names(t),""), init.angle=90)
 symbols(0,0,circles=.3, inches=FALSE, col="white", bg="white", lty=0, add=TRUE)
 title(main = "# Genes", font.main = ifelse(grepl("[[:lower:]]", colnames(chromTable)[j]), 1,3))




 #' 2. Survival analyses
 #' -----------------
 #' ### 2.1 Prepare data
 library(CoxHD)
 tcgaData <- data.frame(tcgaMutation, tcgaClinical[14:24])
 tcgaData <- tcgaData[colSums(tcgaData, na.rm=TRUE)>=5]
 tcgaData <- ImputeXMissing(data.frame(tcgaData, Gender=tcgaClinical$Gender, scale(tcgaClinical[,c(7:12)])))
 rownames(tcgaData) <- rownames(tcgaMutation)
 tcgaData <- tcgaData[rowSums(is.na(tcgaData))==0 & rownames(tcgaData) %in% colnames(tcgaExpr),]
 dataFrame <- data.frame(tcgaData, scale(tcgaPca$x[rownames(tcgaData),1:20]))
 survivalGroups <- rep("Blood", ncol(dataFrame))
 survivalGroups[colnames(tcgaData)%in%colnames(tcgaMutation)] <- "Genetics"
 survivalGroups[grep("^PC", colnames(dataFrame))] <- "Expression"
 survivalGroups[grep("^[a-z]", colnames(dataFrame))] <- "CNA"
 survivalGroups[grep("(t_)|_t", colnames(dataFrame))] <- "Translocations"
 survivalGroups[grep("(Gender)|(AOD)", colnames(dataFrame))] <- "Demographics"
 survivalGroups <- factor(survivalGroups, levels=c("Genetics","CNA","Translocations","Expression","Demographics","Blood"))
 survivalCol=set1[c(3,2,5,4,7,1)]#  c(brewer.pal(8, "Dark2")[1:3], brewer.pal(8, "Set1")[2:1])
 names(survivalCol) <- levels(survivalGroups)

 library(survival)
 tcgaSurvival <- Surv(tcgaClinical$OS + .5, tcgaClinical$Status)[match(rownames(tcgaData),tcgaClinical$TCGA_ID)]

 #' ### 2.2 Fit model and compute variance components of the predicted log-hazard
 #+ TCGA-varianceComponents,  fig.width=3.5, fig.height=2
 coxRFX <- CoxRFX(dataFrame, tcgaSurvival, which.mu = NULL) ## Further developed version of ecoxph
 VarianceComponents(coxRFX, groups=survivalGroups)
 PlotVarianceComponents(coxRFX, col=survivalCol, groups=survivalGroups)
 points(0,0,pch=16, col="white", cex=15)
 title(main="Variance components AML")

 #' #### Stepwise model selection
 #+ TCGA-CoxBIC
 c <- coxph(tcgaSurvival ~ 1, data=dataFrame)
 scopeStep <- as.formula(paste("tcgaSurvival ~", paste(colnames(dataFrame), collapse="+")))
 coxBIC <- step(c, scope=scopeStep, k = log(sum(!is.na(tcgaSurvival))), trace=0)
 summary(coxBIC)

 #' #### Random forests
 #+ TCGA-RSF, cache=TRUE
 library(randomForestSRC)
 m <- match(rownames(tcgaPca$x),tcgaClinical$TCGA_ID)
 rsf <- rfsrc(Surv(time, status) ~ ., data=data.frame(time=tcgaClinical$OS[m], status = tcgaClinical$Status[m], dataFrame ), ntree=100)

 #' Variable importance
 #+ TCGA-RSF-boxplot,  fig.width=1.5, fig.height=2
 par(bty="n", mgp = c(2,.33,0), mar=c(4,3,1,0.5)+.1, las=2, tcl=-.25, las=3, xpd=NA)
 boxplot(rsf$importance ~ survivalGroups, border= survivalCol, staplewex=0, pch=16, cex=0.75, ylab="Variable importance", lty=1, xaxt="n")
 rotatedLabel(x0=1:nlevels(survivalGroups), y0=rep(-0.002,nlevels(survivalGroups)), labels=levels(survivalGroups), srt=45)
 #' The plot confirms the result that expression, blood counts, and clinical variables are most influental.

 #' #### Subset
 set.seed(42)
 cvIdx <- sample(1:5, nrow(dataFrame), replace=TRUE)

 #+ TCGA-concordance, cache=TRUE, warning=FALSE, fig.height=2, fig.width=2.5
 subsets <- list(Genetics="Genetics", Cytogenetics=c("Translocations","CNA"), Blood="Blood", Demographics="Demographics", Expression="Expression", `Gen+Cyt` = c("Genetics","Translocations","CNA"), `Gen+Cyt+Blo+Exp`=c("Genetics","Translocations","CNA","Blood","Expression"), All=unique(survivalGroups))
 colSubsets <- set1
 names(colSubsets) <- c("Blood","Cytogenetics", "Genetics","Expression","Gen+Cyt","Gen+Cyt+Blo+Exp","Demographics","All","Std. Risk")
 concordance <- sapply(1:5, function(i){
 			v <- cvIdx == i
 			c(sapply(subsets, function(s,v){
 						w <- survivalGroups %in% s
 						fit <- CoxRFX(dataFrame[!v, w]+0, tcgaSurvival[!v], which.mu=NULL)
 						p <- as.matrix(dataFrame[v,w]) %*% coef(fit)
 						survConcordance(tcgaSurvival[v]~p)$concordance
 					}, v=v), 
 			Std.Risk =  survConcordance(tcgaSurvival[v]~ c(3,1,2)[tcgaClinical$C_Risk[match(rownames(tcgaData),tcgaClinical$TCGA_ID)][v]])$concordance
 			)
 		})
 rownames(concordance) <- c(names(subsets), "Std. Risk")
 par(mar=c(5,4,1,1), mgp=c(3,0.5,0))
 m <- rowMeans(concordance, na.rm=TRUE)
 e <- apply(concordance,1,var)/ncol(concordance)
 o <- order(m)
 barplot(m[o], col=colSubsets[names(m[o])], names.arg=rep("", nrow(concordance)), ylim=c(0.5,0.75), xpd=FALSE, ylab="Concordance (5x CV)") -> b
 points(jitter(rep(b,5)), concordance[o,], col="grey", pch=16, cex=.5)
 rotatedLabel(b,rep(0.49, nrow(concordance)), rownames(concordance)[o])
 segments(b, m[o]+sqrt(e)[o], b , m[o]-sqrt(e)[o])

 #' #### PC predictions
 #' The predicted PCs have more predictive power than pure genomics
 #+ TCGA-predictedPC, fig.width=1.25, fig.height=2.5, cache=TRUE
 par(mar=c(10,4,1,1))
 pcPrediction <- tcgaDesign[,2:31] %*% t(tcgaGlm$coefficients[,2:31]) %*% tcgaPca$rotation ## Genomics only, no offset
 set.seed(42)
 concordancePrediction <- sapply(1:100, function(i){
 			v <- sample(1:nrow(dataFrame) %% 5 + 1) == 1 ## 80:20 split for cross validation
 			fit <- CoxRFX(pcPrediction[!v, 1:20], tcgaSurvival[!v], which.mu=NULL)
 			p <- as.matrix(pcPrediction[v, 1:20]) %*% coef(fit)
 			pcp <- survConcordance(tcgaSurvival[v]~p)$concordance[1]
 			fit <- CoxRFX(tcgaPca$x[!v, 1:20], tcgaSurvival[!v], which.mu=NULL)
 			p <- as.matrix(tcgaPca$x[v, 1:20]) %*% coef(fit)
 			pc <- survConcordance(tcgaSurvival[v]~p)$concordance[1]
 			fit <- CoxRFX(dataFrame[!v,survivalGroups %in% c("Genetics","Cytogenetics")], tcgaSurvival[!v], which.mu=NULL)
 			p <- as.matrix(dataFrame[v, survivalGroups %in% c("Genetics","Cytogenetics")]) %*% coef(fit)
 			gen <- survConcordance(tcgaSurvival[v]~p)$concordance[1]
 			c(Expression=pc, Genomics=gen, `Expression (predicted)`=pcp)
 		})
 c <- set1[c(4,2,4)]
 boxplot(t(concordancePrediction), notch=TRUE, ylab="Concordance", names=NA, lty=1, staplewex=0, pch=16, xaxt="n", at=1:3, border=c, col=c(NA,NA, brewer.pal(4,"Paired")[1]))
 u <- par("usr")
 rotatedLabel(1:3, rep(u[3] ,3), c("Expression","Genomics","Predicted expression"))

 #' Session
 #' -----
 #+ sessionInfo, eval=TRUE
 sessionInfo()
	#' Supplementary Data 4
	#' =======================
	#' ### Supplementary code and figures accompanying Interconnections among mutations, gene expression, clinical variables and patient outcome in myelodysplastic syndromes
	#'
	#' This document contains the complete code used in the TCGA AML analysis. It is purely written in `R` using a series of `R` and `Bioconductor` packages.
	#' This report has been generated using the `knitr` R package (http://yihui.name/knitr/).
	#' For a complete list of packages and their versions please have a look at the end of this document.

	#+ Preliminaries, echo=FALSE
	options(width=120)
	pdf.options(pointsize=8)
	knit_hooks$set(smallMar = function(before, options, envir) {
	if (before) par(mar=c(3,3,1,1), bty="n", mgp=c(2,.5,0))
	})
	opts_chunk$set(dev=c('my_png','pdf'), fig.ext=c('png','pdf'), fig.width=3, fig.height=3, smallMar=TRUE)
	my_png <- function(file, width, height, pointsize=12, ...) {
	png(file, width = 1.5width, height = 1.5height, units="in", res=72*1.5, pointsize=pointsize, ...)
	}
	#opts_knit$set(root.dir = file.path(getwd(),".."))

	#' TCGA AML expression data analysis
	#' ===========================

	#' #### Libraries
	library(limma)
	library(org.Hs.eg.db)
	library(hgu133plus2.db )
	library(RColorBrewer)
	set1 <- brewer.pal(9,"Set1")
	library(cgdsr)
	library(CoxHD) ## From github.com/mg14/CoxHD
	library(mg14) ## From github.com/mg14/mg14
	library(xtable)
	library(Hmisc)

	#' 1. Differential expression analysis
	#' ----------------------
	#'
	#' ### 1.1 Load data
	#' #### Get entrez ids for genes of interest, take all ids from the hg133plus array to have comparable data.
	entrez <- unique(AnnotationDbi::select(hgu133plus2.db, keys = keys(hgu133plus2.db), columns = c("ENTREZID"))$ENTREZID)

	#' #### Load expression data from cBio portal
	#+ cBioData, cache=TRUE
	mycgds <- CGDS("http://www.cbioportal.org/public-portal/")
	tcgaAML <- getCancerStudies(mycgds)[1,1]
	cases <- getCaseLists(mycgds,tcgaAML)[8,1]
	g <- lapply(split(as.numeric(entrez), seq_along(entrez)%/%500), function(genes) getProfileData(mycgds,genes,getGeneticProfiles(mycgds,tcgaAML)[2,1],cases)) ## load in batches of 500
	g <- do.call("cbind", g)
	tcgaExpr <- log(t(g)+.5)
	rownames(tcgaExpr) <- sub("\\.","-",sub("^[0-9]+\\.","",rownames(tcgaExpr))) ## Fix rownames
	#rm(e,s,g)

	#' The query will have dropped a few genes. Match back to entrezids...
	s <- AnnotationDbi::select(org.Hs.eg.db, rownames(tcgaExpr), "ENTREZID", "SYMBOL")
	m <- match(rownames(tcgaExpr), s$SYMBOL)
	sum(is.na(m))
	rownames(tcgaExpr) <- s$ENTREZID[m]
	colnames(tcgaExpr) <- gsub("\\.","-",colnames(tcgaExpr))
	tcgaExpr <- tcgaExpr[rowSums(is.na(tcgaExpr))==0 & !is.na(rownames(tcgaExpr)),]

	#' Check the log2 FPKM densities across samples
	#+ TCGA-logFPKM, fig.width=2, fig.height=1.8
	plot(density(tcgaExpr[,1]))
	for(j in 1:ncol(tcgaExpr)) lines(density(tcgaExpr[,j]))

	#' #### Load curated clinical and mutation data
	tcgaClinical <- read.table("SuppTableS3_Clinical.txt", sep="\t", header=TRUE, comment.char = "#") ## tab-delimietd version of Supplementary Table S3, Clinical sheet.
	colnames(tcgaClinical)[3:12] <- capitalize(colnames(tcgaClinical)[3:12])
	rownames(tcgaClinical) <- as.character(tcgaClinical$TCGA_ID)
	tcgaGenetic <- read.table("SuppTableS3_Genetic.txt", sep="\t", header=TRUE, comment.char = "#") ## tab-delimietd version of Supplementary Table S3, Genetics sheet.
	tcgaGenetic$TCGA_ID <- factor(as.character(tcgaGenetic$TCGA_ID), levels = levels(tcgaClinical$TCGA_ID))
	g <- as.character(tcgaGenetic$Hugo_Symbol)
	g[tcgaGenetic$Hugo_Symbol=="FLT3" & tcgaGenetic$Variant_Type == 'INS'] <- "FLT3_ITD"
	g[tcgaGenetic$Hugo_Symbol=="FLT3" & tcgaGenetic$Variant_Type == 'SNP'] <- "FLT3_TKD"
	tcgaMutation <- (table(tcgaGenetic$TCGA_ID,g)) > 0

	#' ### 1.2 Compute PCA
	tcgaPca <- prcomp(t(tcgaExpr))

	#' #### Covariates
	t <- cbind(tcgaMutation +0, tcgaClinical[,14:24])
	t <- t[,colSums(t[colnames(tcgaExpr),],na.rm=TRUE)>=5]
	tcgaCovariates <- as.matrix(cbind(Offset=1,t, Gender=tcgaClinical$Gender, Age=tcgaClinical$AOD/10))[colnames(tcgaExpr),]
	groups <- factor(c("Offset", rep("Genetics",22), "Translocations",rep("CNA", 5), rep("Translocations",2), rep("Demographics",2)), levels=c("Offset","Genetics","CNA","Translocations","Demographics"))
	tcgaCovariates <- tcgaCovariates[,order(groups)]
	groups <- groups[order(groups)]
	col1 <- c("grey",set1[c(3,5,2,7)])
	names(col1) <- levels(groups)


	#' #### PCA overview
	#' Now plot a large panel with small multiples of the first two PCs overlaid with the mutation status of each gene.
	#+ TCGA-PCA-panel, fig.width=5, fig.height=4
	par(bty="n", mgp = c(0,0.5,0), mar=c(1,1,1.5,0)+.1, las=1, tcl=-.25, font.main=3, mfrow=c(6,6), xpd=NA)
	i<-0
	for (geneId in colnames(tcgaCovariates)[-1]){
	i<-i+1
	plot(tcgaPca$x[rownames(tcgaCovariates),], cex=0.5,
	pch=NA,
	xlab=ifelse(i==1,"PC1",""), ylab=ifelse(i==1,"PC2",""),main=geneId, font.main=ifelse(grepl("[[:lower:]]",geneId),1,3), cex.main=1.33, cex.lab=1.2,xaxt="n", yaxt="n", ylim=c(-65,65))
	if(geneId != "Age")
	w <- rownames(tcgaCovariates)[which(tcgaCovariates[,geneId] == 1)]
	else
	w <- rownames(tcgaCovariates)[which(tcgaCovariates[,geneId] > median(tcgaCovariates[,geneId], na.rm=TRUE))]
	points(tcgaPca$x[!rownames(tcgaPca$x) %in% w,], pch=ifelse(is.na(tcgaCovariates[!rownames(tcgaPca$x) %in% w,geneId]),1,19), cex=0.5, col="grey", lwd=0.5)
	points(tcgaPca$x[w,], pch=16, cex=0.85, col=col1[groups[i+1]], lwd=0.05)
	u <- matrix(par("usr"), ncol=2)
	if(i==1){
	arrows(u[1,1],u[1,2], u[2,1],u[1,2],length=0.02)
	arrows(u[1,1],u[1,2], u[1,1],u[2,2],length=0.02)
	}
	text(u[2],u[3] + (u[4]-u[3])*.9, labels=paste("n=",length(w), sep=""), bty="n", cex=1.2, pos=2)
	}
	plot.new()
	legend("center",c("Missing","Wildtype","Mutant"), pch=c(1,19,19), col=c("grey","grey","black"), bty="n", pt.cex=c(0.5,0.5,rep(0.85,4)),cex=1.2, pt.lwd=0.5)


	#' ### 1.2 Explained variance R2/F-statistic
	#' Here we want to determine all genes which are associated with any covariate. This will be based on an F-statistic.
	#' lmFit also tests wether the offset is different from zero (trivially true).
	#' Create design matrix of covariates; impute missing values by mean.
	poorMansImpute <- function(x) {x[is.na(x)] <- mean(x, na.rm=TRUE); return(x)}
	tcgaDesign <- apply(tcgaCovariates,2,poorMansImpute)

	#' #### Fit the linear model
	tcgaGlm = lmFit(as.matrix(tcgaExpr), design = tcgaDesign)
	tcgaGlm = eBayes(tcgaGlm)

	#' #### Random model
	#' Compare to a model where all values of the covariates are randomly permuted. This helps reassure FDR estimates.
	set.seed(42)
	tcgaRlm <- lmFit(tcgaExpr[,rownames(tcgaDesign)], apply(tcgaDesign, 2, sample))
	tcgaRlm <- eBayes(tcgaRlm)
	F.stat <- classifyTestsF(tcgaRlm[,-1],fstat.only=TRUE)
	tcgaRlm$F <- as.vector(F.stat)
	df1 <- attr(F.stat,"df1")
	df2 <- attr(F.stat,"df2")
	if(df2[1] > 1e6){ # Work around bug in R 2.1
	tcgaRlm$F.p.value <- pchisq(df1*tcgaRlm$F,df1,lower.tail=FALSE)
	}else
	tcgaRlm$F.p.value <- pf(tcgaRlm$F,df1,df2,lower.tail=FALSE)

	#' #### Explained variance by different categories
	#' The F-statistic is directly related to the R2.
	F.stat <- classifyTestsF(tcgaGlm[,2:22],fstat.only=TRUE) ## All genetics & cytogenetics
	df1 <- attr(F.stat,"df1")
	df2 <- attr(F.stat,"df2")
	F.p.value <- pchisq(df1*F.stat,df1,lower.tail=FALSE)

	R.stat <- classifyTestsF(tcgaRlm[,2:22],fstat.only=TRUE) ## Random

	Rall = 1 - 1/(1 + tcgaGlm$F * (ncol(tcgaDesign)-1)/(nrow(tcgaDesign)-ncol(tcgaDesign)))
	Rgenetics = 1 - 1/(1 + F.stat * 21/(nrow(tcgaDesign)-ncol(tcgaDesign)))
	Pgenetics = 1 - 1/(1 + R.stat * 21/(nrow(tcgaDesign)-ncol(tcgaDesign)))
	names(Rgenetics) <- names(Pgenetics) <- names(Rall) <- rownames(tcgaExpr)

	#' #### Plot the variance explained by genetics
	#+ TCGA-GE-R2, fig.width=2, fig.height=1.8
	par(bty="n", mgp = c(2,.33,0), mar=c(3,2.5,1,1)+.1, las=1, tcl=-.25, xpd=NA)
	d <- density(Pgenetics,bw=1e-3)
	f <- 1#nrow(gexpr)/512
	plot(d$x, d$y * f, col='grey', xlab=expression(paste("Explained variance per gene ", R^2)), main="", lwd=2, type="l", ylab="", xlim=c(0,0.7))
	title(ylab="Density", line=1.5)
	d <- density(Rgenetics, bw=1e-3)
	r <- min(Rgenetics[p.adjust(F.p.value,"BH")<0.05])
	x0 <- which(d$x>r)
	polygon(d$x[c(x0[1],x0)], c(0,d$y[x0])* f, col=paste(set1[1],"44",sep=""), border=NA)
	lines(d$x, d$y* f, col=set1[1], lwd=2)
	#points(d$x[x0[1]], d$y[x0[1]]*f, col=set1[1], pch=16)
	text(d$x[x0[1]], d$y[x0[1]]*f, pos=4, paste(sum(Rgenetics > r), "genes q < 0.05"))
	legend("topright", bty="n", col=c(set1[1], "grey"), lty=1, c("Observed","Random"), lwd=2)

	#' #### Predictions
	glmPrediction <- tcgaGlm$coefficients %*% t(tcgaDesign)
	rlmPrediction <- tcgaRlm$coefficients %*% t(tcgaDesign)

	#' ### 1.4. Test results
	#' Prepare the test results using a hierarchical procedure:
	#' 1. Adjust down transcripts using F-stat
	#' 2. Adjust along covariates
	#+ TCGA-testResults
	testResults <- decideTests(tcgaGlm, method="hierarchical",adjust.method="BH", p.value=0.05)[,-1]
	significantGenes <- sapply(1:ncol(testResults), function(j){
	c <- tcgaGlm$coefficients[testResults[,j]!=0,j+1]
	table(cut(c, breaks=c(-5,seq(-1.5,1.5,l=7),5)))
	})
	colnames(significantGenes) <- colnames(testResults)


	#' #### Table of top 50 deregulated genes contains many HOX genes
	#+ TCGA-top50, results='asis'
	t <- head(sort(Rgenetics, d=TRUE), 50)
	g <- apply(testResults[names(t),], 1, function(x) paste(colnames(testResults)[x!=0], collapse=", "))
	print(xtable(cbind(AnnotationDbi::select(org.Hs.eg.db, names(t), c("SYMBOL","GENENAME"), "ENTREZID"),`R^2`=t, `Mutations`=g)), type="html", )

	#' #### Top genes
	#+ TCGA-GE-predict, fig.width=2, fig.height=1.8
	par(bty="n", mgp = c(1.5,.33,0), mar=c(2.5,2.5,1,1)+.1, las=1, tcl=-.25)
	for(w in names(head(sort(Rgenetics, decreasing=TRUE),4))){
	gene <- AnnotationDbi::select(org.Hs.eg.db, w, "SYMBOL", "ENTREZID")$SYMBOL
	plot(glmPrediction[w,], tcgaExpr[w,rownames(tcgaDesign)], ylab=parse(text=paste("Observed ~ italic(",gene,") ~ expression")), xlab=parse(text=paste("Predicted ~ italic(",gene,") ~ expression")), pch=16, cex=.8)
	par(xpd=FALSE)
	abline(0,1)
	u <- par("usr")
	par(xpd=NA)
	y <- tcgaGlm$coefficients[w,-1]+tcgaGlm$coefficients[w,1]
	u <- par("usr")
	x0 <- rep(u[4]-(u[4]-u[3])/8,ncol(tcgaDesign)-1)
	y0 <- u[4] + 0.05(u[4]-u[3]) - rank(-y)/length(y) (u[4]-u[3])/1.2
	d <- density(y)
	lines(d$x, d$y/5+u[4]-(u[4]-u[3])/8, col="grey")
	lines(d$x, -d$y/5+u[4]-(u[4]-u[3])/8, col="grey")
	points(x=y, y=x0+violinJitter(y, magnitude=0.25)$y,pch=19, col=col1[groups[-1]])
	text(x=tcgaGlm$coefficients[w,1], y= u[4], "Model coefficients (logFC)", cex=0.8)
	v <- tcgaGlm$p.value[w,-1] < 0.01
	rotatedLabel(y[v], x0[v]+0.1, labels=colnames(tcgaDesign)[-1][v], font=ifelse(grepl("[[:lower:]]", colnames(tcgaDesign)[-1]),1,3)[v], cex=.66, pos=1)
	axis(at=-1:1 + tcgaGlm$coefficients[w,1], labels=-1:1, side=3, cex.axis=.8, line=-1, mgp = c(1.5,.05,0), tcl=-.15)
	text(u[2],u[3] + (u[4]-u[3])/10, substitute(paste(R^2==r),list(r=round(Rgenetics[w],2))), pos=2)
	}


	#' #### Barplot
	#+ TCGA-GE-quantiles, fig.width=4, fig.height=2
	par(bty="n", mgp = c(2.5,.33,0), mar=c(3.5,3.3,2,0)+.1, las=2, tcl=-.25)
	b <- barplot(significantGenes, las=2, ylab = "Differentially expressed genes", col=brewer.pal(8,"RdYlBu"), legend.text=FALSE , border=0, xaxt="n")#, col = set1[simple.annot[names(n)]], border=NA)
	rotatedLabel(x0=b, y0=rep(10, ncol(significantGenes)), labels=colnames(significantGenes), cex=.7, srt=45, font=ifelse(grepl("[[:lower:]]", colnames(tcgaDesign))[-1], 1,3), col=col1[groups[-1]])
	#text(b+0.2, colSums(n)+50, colSums(n), pos=3, cex=.7, srt=90)
	x0 <-par("usr")[1] + 0.05 * (par("usr")[2]-par("usr")[1])
	dy <- 0.1*(par("usr")[4] - par("usr")[3])
	y0 <- par("usr")[4] - 1.5*dy
	image(x=x0+c(0,0.8), y=y0+seq(-dy,dy,l=9), z=matrix(1:8, ncol=8), col=brewer.pal(8,"RdYlBu"), add=TRUE)
	text(x=x0+1.5, y=y0+seq(-dy/2,dy/2,l=3), format(seq(-1,1,l=3),2), cex=0.66)
	lines(x=rep(x0+.8,2), y=y0+c(-dy.75,dy.75))
	segments(x0+.8,y0+seq(-dy.75,dy.75,l=7),x0+.9,y0+seq(-dy.75,dy.75,l=7))
	text(x0+.8, y0+dy*1.1, "log2 FC", cex=.66)
	rotatedLabel(b-0.1, colSums(significantGenes), colSums(significantGenes), pos=3, cex=, srt=45)

	#' #### Associated mutations per transcript:
	#+ TCGA-GE-transcripts, fig.width=2, fig.height=2
	par(bty="n", mgp = c(2.5,.33,0), mar=c(3,3.3,3,0)+.1, las=1, tcl=-.25)
	t <- table(rowSums(abs(testResults[,1:16])))
	b <- barplot(t[-1],ylab="Differentially expressed genes", col=rev(brewer.pal(7, "Spectral")[-(4:5)]), border=NA)
	rotatedLabel(b-0.1, t[-1], t[-1], pos=3, cex=1, srt=45)
	title(xlab="Associated drivers", line=2)


	#' #### Chromosomal distribution
	chr = factor(sapply(AnnotationDbi::mget(rownames(tcgaExpr), org.Hs.egCHR, ifnotfound=NA), `[`,1), levels=c(1:22, "X","Y","MT"))
	chromTable <- apply(testResults,2, function(x) table(chr[x!=0]))

	#+ TCGA-GE-CHR, fig.width=6, fig.height=6
	par(bty="n", mgp = c(0.5,0.5,0), las=1, tcl=-.25, font.main=3, mfrow=c(6,6), xpd=NA, mar=c(0,0,1.5,0))
	for(j in 1:ncol(testResults)){
	n <- sum(testResults[,j]!=0)
	pie(chromTable[,j], col=colorRampPalette(brewer.pal(11,'Spectral'))(24), border="white", radius=0.8, init.angle=90, labels=ifelse(chromTable[,j]/sum(chromTable[,j]) > 0.02, paste("",rownames(chromTable), "(" ,chromTable[,j], ")",sep=""),""))
	title(main = colnames(chromTable)[j], font.main = ifelse(grepl("[[:lower:]]", colnames(chromTable)[j]), 1,3), cex.main=1.33)
	symbols(0,0,circles=.3, inches=FALSE, col="white", bg="white", lty=0, add=TRUE)
	#symbols(0,0,circles=.8*(1-sqrt(n/max(colSums(testResults!=0)))), col="white", add=TRUE, lty=0, bg="white", inches=FALSE)
	#cat(n,"\n")
	}
	t <- table(chr)
	pie(t, col=colorRampPalette(brewer.pal(11,'Spectral'))(24), border="white", radius=0.8, cex.main=1.33, labels=ifelse(t/sum(t) > 0.02, names(t),""), init.angle=90)
	symbols(0,0,circles=.3, inches=FALSE, col="white", bg="white", lty=0, add=TRUE)
	title(main = "# Genes", font.main = ifelse(grepl("[[:lower:]]", colnames(chromTable)[j]), 1,3))




	#' 2. Survival analyses
	#' -----------------
	#' ### 2.1 Prepare data
	library(CoxHD)
	tcgaData <- data.frame(tcgaMutation, tcgaClinical[14:24])
	tcgaData <- tcgaData[colSums(tcgaData, na.rm=TRUE)>=5]
	tcgaData <- ImputeXMissing(data.frame(tcgaData, Gender=tcgaClinical$Gender, scale(tcgaClinical[,c(7:12)])))
	rownames(tcgaData) <- rownames(tcgaMutation)
	tcgaData <- tcgaData[rowSums(is.na(tcgaData))==0 & rownames(tcgaData) %in% colnames(tcgaExpr),]
	dataFrame <- data.frame(tcgaData, scale(tcgaPca$x[rownames(tcgaData),1:20]))
	survivalGroups <- rep("Blood", ncol(dataFrame))
	survivalGroups[colnames(tcgaData)%in%colnames(tcgaMutation)] <- "Genetics"
	survivalGroups[grep("^PC", colnames(dataFrame))] <- "Expression"
	survivalGroups[grep("^[a-z]", colnames(dataFrame))] <- "CNA"
	survivalGroups[grep("(t_)\|_t", colnames(dataFrame))] <- "Translocations"
	survivalGroups[grep("(Gender)\|(AOD)", colnames(dataFrame))] <- "Demographics"
	survivalGroups <- factor(survivalGroups, levels=c("Genetics","CNA","Translocations","Expression","Demographics","Blood"))
	survivalCol=set1[c(3,2,5,4,7,1)]# c(brewer.pal(8, "Dark2")[1:3], brewer.pal(8, "Set1")[2:1])
	names(survivalCol) <- levels(survivalGroups)

	library(survival)
	tcgaSurvival <- Surv(tcgaClinical$OS + .5, tcgaClinical$Status)[match(rownames(tcgaData),tcgaClinical$TCGA_ID)]

	#' ### 2.2 Fit model and compute variance components of the predicted log-hazard
	#+ TCGA-varianceComponents, fig.width=3.5, fig.height=2
	coxRFX <- CoxRFX(dataFrame, tcgaSurvival, which.mu = NULL) ## Further developed version of ecoxph
	VarianceComponents(coxRFX, groups=survivalGroups)
	PlotVarianceComponents(coxRFX, col=survivalCol, groups=survivalGroups)
	points(0,0,pch=16, col="white", cex=15)
	title(main="Variance components AML")

	#' #### Stepwise model selection
	#+ TCGA-CoxBIC
	c <- coxph(tcgaSurvival ~ 1, data=dataFrame)
	scopeStep <- as.formula(paste("tcgaSurvival ~", paste(colnames(dataFrame), collapse="+")))
	coxBIC <- step(c, scope=scopeStep, k = log(sum(!is.na(tcgaSurvival))), trace=0)
	summary(coxBIC)

	#' #### Random forests
	#+ TCGA-RSF, cache=TRUE
	library(randomForestSRC)
	m <- match(rownames(tcgaPca$x),tcgaClinical$TCGA_ID)
	rsf <- rfsrc(Surv(time, status) ~ ., data=data.frame(time=tcgaClinical$OS[m], status = tcgaClinical$Status[m], dataFrame ), ntree=100)

	#' Variable importance
	#+ TCGA-RSF-boxplot, fig.width=1.5, fig.height=2
	par(bty="n", mgp = c(2,.33,0), mar=c(4,3,1,0.5)+.1, las=2, tcl=-.25, las=3, xpd=NA)
	boxplot(rsf$importance ~ survivalGroups, border= survivalCol, staplewex=0, pch=16, cex=0.75, ylab="Variable importance", lty=1, xaxt="n")
	rotatedLabel(x0=1:nlevels(survivalGroups), y0=rep(-0.002,nlevels(survivalGroups)), labels=levels(survivalGroups), srt=45)
	#' The plot confirms the result that expression, blood counts, and clinical variables are most influental.

	#' #### Subset
	set.seed(42)
	cvIdx <- sample(1:5, nrow(dataFrame), replace=TRUE)

	#+ TCGA-concordance, cache=TRUE, warning=FALSE, fig.height=2, fig.width=2.5
	subsets <- list(Genetics="Genetics", Cytogenetics=c("Translocations","CNA"), Blood="Blood", Demographics="Demographics", Expression="Expression", `Gen+Cyt` = c("Genetics","Translocations","CNA"), `Gen+Cyt+Blo+Exp`=c("Genetics","Translocations","CNA","Blood","Expression"), All=unique(survivalGroups))
	colSubsets <- set1
	names(colSubsets) <- c("Blood","Cytogenetics", "Genetics","Expression","Gen+Cyt","Gen+Cyt+Blo+Exp","Demographics","All","Std. Risk")
	concordance <- sapply(1:5, function(i){
	v <- cvIdx == i
	c(sapply(subsets, function(s,v){
	w <- survivalGroups %in% s
	fit <- CoxRFX(dataFrame[!v, w]+0, tcgaSurvival[!v], which.mu=NULL)
	p <- as.matrix(dataFrame[v,w]) %*% coef(fit)
	survConcordance(tcgaSurvival[v]~p)$concordance
	}, v=v),
	Std.Risk = survConcordance(tcgaSurvival[v]~ c(3,1,2)[tcgaClinical$C_Risk[match(rownames(tcgaData),tcgaClinical$TCGA_ID)][v]])$concordance
	)
	})
	rownames(concordance) <- c(names(subsets), "Std. Risk")
	par(mar=c(5,4,1,1), mgp=c(3,0.5,0))
	m <- rowMeans(concordance, na.rm=TRUE)
	e <- apply(concordance,1,var)/ncol(concordance)
	o <- order(m)
	barplot(m[o], col=colSubsets[names(m[o])], names.arg=rep("", nrow(concordance)), ylim=c(0.5,0.75), xpd=FALSE, ylab="Concordance (5x CV)") -> b
	points(jitter(rep(b,5)), concordance[o,], col="grey", pch=16, cex=.5)
	rotatedLabel(b,rep(0.49, nrow(concordance)), rownames(concordance)[o])
	segments(b, m[o]+sqrt(e)[o], b , m[o]-sqrt(e)[o])

	#' #### PC predictions
	#' The predicted PCs have more predictive power than pure genomics
	#+ TCGA-predictedPC, fig.width=1.25, fig.height=2.5, cache=TRUE
	par(mar=c(10,4,1,1))
	pcPrediction <- tcgaDesign[,2:31] %% t(tcgaGlm$coefficients[,2:31]) %% tcgaPca$rotation ## Genomics only, no offset
	set.seed(42)
	concordancePrediction <- sapply(1:100, function(i){
	v <- sample(1:nrow(dataFrame) %% 5 + 1) == 1 ## 80:20 split for cross validation
	fit <- CoxRFX(pcPrediction[!v, 1:20], tcgaSurvival[!v], which.mu=NULL)
	p <- as.matrix(pcPrediction[v, 1:20]) %*% coef(fit)
	pcp <- survConcordance(tcgaSurvival[v]~p)$concordance[1]
	fit <- CoxRFX(tcgaPca$x[!v, 1:20], tcgaSurvival[!v], which.mu=NULL)
	p <- as.matrix(tcgaPca$x[v, 1:20]) %*% coef(fit)
	pc <- survConcordance(tcgaSurvival[v]~p)$concordance[1]
	fit <- CoxRFX(dataFrame[!v,survivalGroups %in% c("Genetics","Cytogenetics")], tcgaSurvival[!v], which.mu=NULL)
	p <- as.matrix(dataFrame[v, survivalGroups %in% c("Genetics","Cytogenetics")]) %*% coef(fit)
	gen <- survConcordance(tcgaSurvival[v]~p)$concordance[1]
	c(Expression=pc, Genomics=gen, `Expression (predicted)`=pcp)
	})
	c <- set1[c(4,2,4)]
	boxplot(t(concordancePrediction), notch=TRUE, ylab="Concordance", names=NA, lty=1, staplewex=0, pch=16, xaxt="n", at=1:3, border=c, col=c(NA,NA, brewer.pal(4,"Paired")[1]))
	u <- par("usr")
	rotatedLabel(1:3, rep(u[3] ,3), c("Expression","Genomics","Predicted expression"))

	#' Session
	#' -----
	#+ sessionInfo, eval=TRUE
	sessionInfo()