alyssafrazee · October 21, 2014 04:08
diff --git a/geuvadis_analysis.R b/geuvadis_analysis.R
 ## simple analysis code for GEUVADIS data
 ## AF Oct 2014

 library(ballgown) #biocLite
 library(RSkittleBrewer) #install_github
 library(RColorBrewer) #CRAN
 library(usefulstuff) #install_github
 library(RCurl) #CRAN

 load('fpkm.rda') # download at http://files.figshare.com/1625419/fpkm.rda
 expressed = exprfilter(fpkm, cutoff=1)
 expr = subset(expressed, 'UseThisDup == 1', genomesubset=FALSE)

 # add lab ID to pData:
 QCurl = getURL('https://raw.githubusercontent.com/alyssafrazee/ballgown_code/master/GEUVADIS_preprocessing/GD667.QCstats.masterfile.txt')
 qcstats = read.table(textConnection(QCurl), sep='\t', header=TRUE)
 pData(expr)$lab = qcstats$SeqLabNumber[match(pData(expr)$SampleID, rownames(qcstats))]

 y = log2(texpr(expr)+1)
 pca = prcomp(t(y))
 pcmat = pca$x
 # take the transpose because we want to reduce the *transcript* 
 # dimension (not the replicate dimension)
 # so we put the transcripts in columns)

 # make some plots to figure out what the important stuff is:
 cols = brewer.pal(5, 'Dark2')
 sid = ballgown:::ss(rownames(pcmat), pattern='\\.', slot=2)
 sum(sid != pData(expr)$dirname) #just make sure in same order

 n = nrow(pData(expr))
 popinds = split(1:n, pData(expr)$population)
 labinds = split(1:n, pData(expr)$lab)
 rinmed = median(pData(expr)$RIN)
 pData(expr)$hirin = ifelse(pData(expr)$RIN > rinmed, 'high', 'low')
 rininds = split(1:n, pData(expr)$hirin)

 plot(pcmat[,1], pcmat[,2], type='n', xlab='PC1', ylab='PC2')
 for(i in seq_along(rininds)){
    points(pcmat[rininds[[i]], 1], pcmat[rininds[[i]], 2], 
        pch=19, col=makeTransparent(cols[i]))
 }

 pdf('population_pc.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(cols[i]))
    }
    legend('topleft', names(popinds), col=cols, pch=19)
 dev.off()

 pdf('eur_yri_pc.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        thecolor = ifelse(i==5, 'red', 'black')
        points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(thecolor))
    }
    legend('topleft', c('EUR', 'YRI'), col=c('black', 'red'), pch=19)
 dev.off()

 ## PC1 is seems to be associated with lab:
 labcolors = brewer.pal(7, 'Set1')
 labcolors[6] = 'black'
 labcolors = sample(labcolors) #so the red/pink labs aren't next to each other
 pdf('lab_effect.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    for(i in seq_along(labinds)){
        points(pcmat[labinds[[i]], 2], pcmat[labinds[[i]], 1],
        pch=19, col=makeTransparent(labcolors[i]))
    } #zomg.
    legend('topleft', col=labcolors, pch=19, paste('lab', 1:7))
 dev.off()

 ## PC2 is correlated with RIN:
 cor(pcmat[,2], pData(expr)$RIN) #0.72
 (cor(pcmat[,2], pData(expr)$RIN))^2 #0.52
 pdf('rin_effect.pdf')
    plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
    points(pcmat[pData(expr)$hirin=='high', 2], 
        pcmat[pData(expr)$hirin=='high', 1], 
        col=makeTransparent('dodgerblue'),
        pch=19)
    points(pcmat[pData(expr)$hirin=='low', 2], 
        pcmat[pData(expr)$hirin=='low', 1], 
        col=makeTransparent('orange'),
        pch=19)
    legend('topleft', col=c('dodgerblue', 'orange'), 
        c('high RNA quality', 'low RNA quality'), pch=19)
 dev.off()

 ## can see some population structure in PC2 vs. PC3:
 pdf('pc3.pdf')
    plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
    }
    legend('topleft', names(popinds), col=cols, pch=19)
 dev.off()

 ## uncolored plot
 pdf('nocolor.pdf')
    plot(pcmat[,2], pcmat[,1], xlab='PC2', ylab='PC1', col=makeTransparent('black'), pch=19)
 dev.off()

 ## percent variation explained:
 names(pca)
 pctvar = pca$sdev^2 / sum(pca$sdev^2)
 cumulative_pctvar = cumsum(pca$sdev^2) / sum(pca$sdev^2)
 pdf('pctvar.pdf')
    plot(cumulative_pctvar, pch=21, bg='gray80', xlab='Principal Component Index', ylab='Cumulative Percent Variance Explained')
 dev.off()

 pdf('pc3_ceu_gbr_yri.pdf')
    plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
    for(i in seq_along(popinds)){
        ii = popinds[[i]]
        if(i %% 2 == 1){
            points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
        }
    }
    legend('topleft', names(popinds)[c(1,3,5)], col=cols[c(1,3,5)], pch=19)
 dev.off()
	## simple analysis code for GEUVADIS data
	## AF Oct 2014

	library(ballgown) #biocLite
	library(RSkittleBrewer) #install_github
	library(RColorBrewer) #CRAN
	library(usefulstuff) #install_github
	library(RCurl) #CRAN

	load('fpkm.rda') # download at http://files.figshare.com/1625419/fpkm.rda
	expressed = exprfilter(fpkm, cutoff=1)
	expr = subset(expressed, 'UseThisDup == 1', genomesubset=FALSE)

	# add lab ID to pData:
	QCurl = getURL('https://raw.githubusercontent.com/alyssafrazee/ballgown_code/master/GEUVADIS_preprocessing/GD667.QCstats.masterfile.txt')
	qcstats = read.table(textConnection(QCurl), sep='\t', header=TRUE)
	pData(expr)$lab = qcstats$SeqLabNumber[match(pData(expr)$SampleID, rownames(qcstats))]

	y = log2(texpr(expr)+1)
	pca = prcomp(t(y))
	pcmat = pca$x
	# take the transpose because we want to reduce the transcript
	# dimension (not the replicate dimension)
	# so we put the transcripts in columns)

	# make some plots to figure out what the important stuff is:
	cols = brewer.pal(5, 'Dark2')
	sid = ballgown:::ss(rownames(pcmat), pattern='\\.', slot=2)
	sum(sid != pData(expr)$dirname) #just make sure in same order

	n = nrow(pData(expr))
	popinds = split(1:n, pData(expr)$population)
	labinds = split(1:n, pData(expr)$lab)
	rinmed = median(pData(expr)$RIN)
	pData(expr)$hirin = ifelse(pData(expr)$RIN > rinmed, 'high', 'low')
	rininds = split(1:n, pData(expr)$hirin)

	plot(pcmat[,1], pcmat[,2], type='n', xlab='PC1', ylab='PC2')
	for(i in seq_along(rininds)){
	points(pcmat[rininds[[i]], 1], pcmat[rininds[[i]], 2],
	pch=19, col=makeTransparent(cols[i]))
	}

	pdf('population_pc.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(cols[i]))
	}
	legend('topleft', names(popinds), col=cols, pch=19)
	dev.off()

	pdf('eur_yri_pc.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	thecolor = ifelse(i==5, 'red', 'black')
	points(pcmat[ii,2], pcmat[ii,1], pch=19, col=makeTransparent(thecolor))
	}
	legend('topleft', c('EUR', 'YRI'), col=c('black', 'red'), pch=19)
	dev.off()

	## PC1 is seems to be associated with lab:
	labcolors = brewer.pal(7, 'Set1')
	labcolors[6] = 'black'
	labcolors = sample(labcolors) #so the red/pink labs aren't next to each other
	pdf('lab_effect.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	for(i in seq_along(labinds)){
	points(pcmat[labinds[[i]], 2], pcmat[labinds[[i]], 1],
	pch=19, col=makeTransparent(labcolors[i]))
	} #zomg.
	legend('topleft', col=labcolors, pch=19, paste('lab', 1:7))
	dev.off()

	## PC2 is correlated with RIN:
	cor(pcmat[,2], pData(expr)$RIN) #0.72
	(cor(pcmat[,2], pData(expr)$RIN))^2 #0.52
	pdf('rin_effect.pdf')
	plot(pcmat[,2], pcmat[,1], type='n', xlab='PC2', ylab='PC1')
	points(pcmat[pData(expr)$hirin=='high', 2],
	pcmat[pData(expr)$hirin=='high', 1],
	col=makeTransparent('dodgerblue'),
	pch=19)
	points(pcmat[pData(expr)$hirin=='low', 2],
	pcmat[pData(expr)$hirin=='low', 1],
	col=makeTransparent('orange'),
	pch=19)
	legend('topleft', col=c('dodgerblue', 'orange'),
	c('high RNA quality', 'low RNA quality'), pch=19)
	dev.off()

	## can see some population structure in PC2 vs. PC3:
	pdf('pc3.pdf')
	plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
	}
	legend('topleft', names(popinds), col=cols, pch=19)
	dev.off()

	## uncolored plot
	pdf('nocolor.pdf')
	plot(pcmat[,2], pcmat[,1], xlab='PC2', ylab='PC1', col=makeTransparent('black'), pch=19)
	dev.off()

	## percent variation explained:
	names(pca)
	pctvar = pca$sdev^2 / sum(pca$sdev^2)
	cumulative_pctvar = cumsum(pca$sdev^2) / sum(pca$sdev^2)
	pdf('pctvar.pdf')
	plot(cumulative_pctvar, pch=21, bg='gray80', xlab='Principal Component Index', ylab='Cumulative Percent Variance Explained')
	dev.off()

	pdf('pc3_ceu_gbr_yri.pdf')
	plot(pcmat[,2], pcmat[,3], type='n', xlab='PC2', ylab='PC3', ylim=c(-50, 45))
	for(i in seq_along(popinds)){
	ii = popinds[[i]]
	if(i %% 2 == 1){
	points(pcmat[ii,2], pcmat[ii,3], pch=19, col=makeTransparent(cols[i]))
	}
	}
	legend('topleft', names(popinds)[c(1,3,5)], col=cols[c(1,3,5)], pch=19)
	dev.off()