Last active
April 26, 2018 02:30
-
-
Save mbk0asis/ded19ad3624b866d71fa6ff810814141 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(DESeq2) | |
| library(pcaExplorer) | |
| library(ggplot2) | |
| library(gplots) | |
| library(ggrepel) | |
| library(reshape2) | |
| # set working directory | |
| setwd('~/00-NGS/SETDB1_TCGA/GSE40419/') | |
| # set parameters | |
| type="ERV1" | |
| TSS="1kb" | |
| cutoff=10 #mean count cut-off | |
| # open a count file | |
| countsTable<- read.csv(paste("L1.ERV.rmsk.Promoter.",TSS,".counts",sep=""),row.names=1, header=T) | |
| #countsTable <- read.csv(paste(type,".Promoter.",TSS,".counts",sep=""),row.names=1, header=T) | |
| #countsTable<- read.csv("L1.ERV.rmsk.counts.csv",row.names=1, header=T) | |
| head(countsTable) | |
| dim(countsTable) | |
| #################################################################### | |
| # Sort by SETDB1 expression | |
| t.countsTable <- t(countsTable) | |
| t.countsTable2 <- t.countsTable[order(-t.countsTable[,1]),] | |
| setdb1<-t(t.countsTable2[,1]) | |
| setdb1 | |
| #barplot(setdb1,col=c(rep("orange",20),rep("darkgreen",20)),ylab="RPKM") | |
| countsTable <- t(t.countsTable2[,-1]) | |
| head(countsTable) | |
| ################################################################### | |
| ################################################################### | |
| # to cut off low count elements | |
| cnts_noZero <- countsTable[rowMeans(abs(countsTable)) >= cutoff,] | |
| dim(cnts_noZero) | |
| #################################################################### | |
| conds<- factor(c(rep("SETDB1_High",20),rep("SETDB1_Low",20))) | |
| coldat=DataFrame(conds=factor(conds)) | |
| dds <- DESeqDataSetFromMatrix(cnts_noZero, colData=coldat, design = ~ conds) | |
| dds <- DESeq(dds, fitType = "mean") | |
| cntNorm <- counts(dds, normalized=T) | |
| ################################################################### | |
| # | |
| # BOXPLOT | |
| # | |
| tiff(paste("Boxplot_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 1200, height = 600) | |
| par(mfrow=c(1,2)) | |
| boxplot(log2(cntNorm+1), outline=T, #ylim=c(0,20), | |
| ylab = "log2 counts", main=paste("Normalized, \nTSS < ",TSS,",\nmean count > ",cutoff), | |
| col=c(rep("orange",20),rep("darkgreen",20))) | |
| boxplot(log2(cnts_noZero+1), outline=T, #ylim=c(0,20), | |
| ylab = "log2 counts", main=paste("RAW, \nTSS < ",TSS,",\nmean count > ",cutoff), | |
| col=c(rep("orange",20),rep("darkgreen",20))) | |
| dev.off() | |
| ################################################################### | |
| # | |
| # Heatmap-fold change | |
| # | |
| fc <- log2(cntNorm+1) - log2(rowMeans(cntNorm[,1:20])+1) | |
| breaks = unique(c(seq(-1,0,length=75), seq(0,1,length=75))) | |
| tiff(paste("Heatmap_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 450, height = 500) | |
| heatmap.2(fc, main=paste("Relative to SETDB1 High mean, \nTSS < ",TSS,",\nmeanCount > ",cutoff), | |
| trace="none",density="none", breaks = breaks, | |
| col=bluered, Colv = F, margins = c(8,2), | |
| ColSideColors = c(rep("orange",20),rep("darkgreen",20))) | |
| dev.off() | |
| ################################################################### | |
| # | |
| # PCA | |
| # | |
| rld <- rlogTransformation(dds,fitType = "mean") | |
| ## Automatic | |
| #pcaplot(rld,intgroup=c("conds"),ntop=500, text_labels=F, title="PCA",pcX=1,pcY=2,point_size=3,ellipse=F) | |
| ## Manual | |
| # select top n elements with larger variations | |
| ntop=5000 | |
| rv <- rowVars(assay(rld)) | |
| select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,length(rv)))] | |
| # run PCA | |
| pca <- prcomp(t(assay(rld)[select,])) | |
| percentVar <- pca$sdev^2/sum(pca$sdev^2) | |
| # plot PCA result | |
| df <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], group = conds, t(setdb1)) | |
| colnames(df) <-c("PC1","PC2","conds","Setdb1") | |
| tiff(paste("PCA_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 400, height = 300) | |
| ggplot(data = df, aes_string(x = "PC1", y = "PC2", size="Setdb1", color="Setdb1")) + | |
| geom_point(alpha=.8) + theme_bw() + | |
| xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) + | |
| ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) + | |
| scale_colour_gradient2(midpoint=40,low="green",mid="grey60",high="red", space = "Lab", na.value = "grey50", guide = "colourbar") | |
| dev.off() | |
| # | |
| #vsd <- varianceStabilizingTransformation(dds, fitType = "mean") | |
| #pcaplot(vsd,intgroup=c("conds"),text_labels=F,title="PCA",point_size=5,ellipse=F,pcX=1,pcY=2,ntop=100000) | |
| #################################################################### | |
| # | |
| # Pearson correlation | |
| # | |
| dta.cor <- round(cor(cntNorm,method="pearson"),5) | |
| tiff(paste("Pearson_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 500, height = 500) | |
| heatmap.2(dta.cor, main=paste("Pearson, \nTSS < ",TSS,",\nmean count > ",cutoff), | |
| col=bluered,density="none",trace="none", | |
| #cellnote = dta.cor,notecol="black",notecex=2.5, | |
| Rowv = T,Colv = T, dendrogram="both", margins = c(8,8), | |
| key.title="Pearson",key.xlab = NULL,key.ylab = NULL | |
| ,RowSideColors=c(rep("orange",20), | |
| rep("darkgreen",20)), | |
| ColSideColors=c(rep("orange",20), | |
| rep("darkgreen",20))) | |
| dev.off() | |
| #################################################################### | |
| # | |
| # DE analysis | |
| # | |
| res <- results(dds, contrast = c("conds","SETDB1_Low","SETDB1_High")) | |
| #head(res) | |
| #nrow(subset(res, pvalue<=.05 & log2FoldChange <= -1)) | |
| #nrow(subset(res, pvalue<=.05 & log2FoldChange >= 1)) | |
| res_sig <- subset(res, pvalue<=.05 & abs(log2FoldChange)>=1) | |
| write.csv(res_sig,paste("DE_",type,"_TSS_",TSS,"_meanCount_",cutoff,".csv", sep="")) | |
| # volcano plot | |
| #res$pvalue[-log10(res$pvalue) > 10] <- 10 # outlier cutoff | |
| #res$log2FoldChange[abs(res$log2FoldChange) > 5] <- 5 # outlier cutoff | |
| tiff(paste("Volcano_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 350, height = 400) | |
| with(res, plot(log2FoldChange,-log10(pvalue), | |
| pch=19,cex=1,main="SETDB1 LOW(-) HIGH(+)", | |
| col=rgb(.8,.8,.8,.3))) | |
| with(subset(res, pvalue<=0.05 & log2FoldChange>=1), | |
| points(log2FoldChange,-log10(pvalue),pch=19,cex=1,col=rgb(.8,.5,0,.8))) | |
| with(subset(res, pvalue<=0.05 & log2FoldChange<=-1), | |
| points(log2FoldChange,-log10(pvalue),pch=19,cex=1,col=rgb(0,.4,0,.8))) | |
| abline(v=c(-1,1),col="blue",lty=2) | |
| abline(v=0,col="blue") | |
| abline(h=c(-log10(0.05)),col="blue",lty=2) | |
| dev.off() | |
| #################################################################### | |
| #################################################################### | |
| # | |
| # Boxplot - mean expression distribution | |
| # | |
| high_mean <- rowMeans(cntNorm[,1:20]) | |
| low_mean <- rowMeans(cntNorm[,21:40]) | |
| df2<- data.frame(high_mean,low_mean) | |
| head(df2) | |
| wt <- wilcox.test(as.numeric(high_mean), as.numeric(low_mean)) | |
| pval <- round(wt$p.value,5) # pvalues | |
| tiff(paste("Boxplot_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 300, height = 500) | |
| par(mar=c(6,5,3,3), cex.main=0.5) | |
| stripchart(log2(df2+1), main=paste(type,", TSS < ",TSS,", mean count > ",cutoff,",\nWilcoxon = ",pval,sep=""), | |
| method='jitter', pch=20,cex=.9, vertical=T,ylab="log2(cntNorm + 1)", | |
| col=(c(rgb(.8,.8,.3,.7),rgb(.3,.8,.8,.7))),axes = F,jitter=.2) | |
| boxplot(log2(df2+1), las=2, col=(c(rgb(.8,.3,.3,.5),rgb(.3,.3,.8,.5))), add = T, | |
| outline=F) | |
| dev.off() | |
| ################################################################### | |
| # | |
| # k-means clustering | |
| # | |
| cl <- kmeans(fc,12) | |
| cluster<-cl$cluster | |
| table(cluster) | |
| rNames <- rownames(fc) | |
| df<-data.frame(fc, rNames, cluster) # attach cluster info on the data frame | |
| mdata <- melt(df, id=c("rNames","cluster")) | |
| head(mdata) | |
| # plotting using ggplot2 | |
| p <- ggplot(mdata, aes(x=variable, y=value, group=rNames, colour=variable)) | |
| jit<- position_jitter (width = .08, height = .08) | |
| p + | |
| geom_line(position = jit, colour = alpha("black", 0.1)) + # individual data | |
| stat_summary(aes(group = cluster), fun.y = mean, geom = "line", size = 1.5 ) + # mean values of each cluster | |
| facet_wrap(~ cluster) # plots divided by the clusters | |
| ggsave(filename=paste("kmeans_",type,"_TSS_",TSS,"_meanCount_",cutoff,".png", sep=""), width = 12, height = 9, dpi = 600) | |
| write.csv(table(cluster),paste("kmeans_TSS_",TSS,"_meanCount_",cutoff,"_Table.csv", sep="")) | |
| ###################################################################################### | |
| # | |
| # modified source code for PCA function | |
| # | |
| runPCA <- function (object, ...) | |
| { | |
| .local <- function (object,intgroup="condition",ntop=500,returnData=T) | |
| { | |
| rv <- rowVars(assay(object)) | |
| select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,length(rv)))] | |
| pca <- prcomp(t(assay(object)[select, ])) | |
| percentVar <- pca$sdev^2/sum(pca$sdev^2) | |
| if (!all(intgroup %in% names(colData(object)))) { | |
| stop("the argument 'intgroup' should specify columns of colData(dds)") | |
| } | |
| intgroup.df <- as.data.frame(colData(object)[, intgroup,drop = FALSE]) | |
| group <- if (length(intgroup) > 1) { | |
| factor(apply(intgroup.df, 1, paste, collapse = ":")) | |
| } | |
| else { | |
| colData(object)[[intgroup]] | |
| } | |
| d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], group = group, | |
| intgroup.df, name = colnames(object)) | |
| if (returnData) { | |
| attr(d, "percentVar") <- percentVar[1:2] | |
| return(d) | |
| } | |
| #ggplot(data = d, aes_string(x = "PC1", y = "PC2", color = "group")) + | |
| # geom_point(size = 5, alpha=0.8) + xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) + | |
| # ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) + coord_fixed() + | |
| # scale_color_manual(values=c("darkorange2", "darkgreen")) | |
| } | |
| .local(object, ...) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment