Skip to content

Instantly share code, notes, and snippets.

@mbk0asis
Last active April 26, 2018 02:30
Show Gist options
  • Save mbk0asis/ded19ad3624b866d71fa6ff810814141 to your computer and use it in GitHub Desktop.
Save mbk0asis/ded19ad3624b866d71fa6ff810814141 to your computer and use it in GitHub Desktop.
library(DESeq2)
library(pcaExplorer)
library(ggplot2)
library(gplots)
library(ggrepel)
library(reshape2)
# set working directory
setwd('~/00-NGS/SETDB1_TCGA/GSE40419/')
# set parameters
type="ERV1"
TSS="1kb"
cutoff=10 #mean count cut-off
# open a count file
countsTable<- read.csv(paste("L1.ERV.rmsk.Promoter.",TSS,".counts",sep=""),row.names=1, header=T)
#countsTable <- read.csv(paste(type,".Promoter.",TSS,".counts",sep=""),row.names=1, header=T)
#countsTable<- read.csv("L1.ERV.rmsk.counts.csv",row.names=1, header=T)
head(countsTable)
dim(countsTable)
####################################################################
# Sort by SETDB1 expression
t.countsTable <- t(countsTable)
t.countsTable2 <- t.countsTable[order(-t.countsTable[,1]),]
setdb1<-t(t.countsTable2[,1])
setdb1
#barplot(setdb1,col=c(rep("orange",20),rep("darkgreen",20)),ylab="RPKM")
countsTable <- t(t.countsTable2[,-1])
head(countsTable)
###################################################################
###################################################################
# to cut off low count elements
cnts_noZero <- countsTable[rowMeans(abs(countsTable)) >= cutoff,]
dim(cnts_noZero)
####################################################################
conds<- factor(c(rep("SETDB1_High",20),rep("SETDB1_Low",20)))
coldat=DataFrame(conds=factor(conds))
dds <- DESeqDataSetFromMatrix(cnts_noZero, colData=coldat, design = ~ conds)
dds <- DESeq(dds, fitType = "mean")
cntNorm <- counts(dds, normalized=T)
###################################################################
#
# BOXPLOT
#
tiff(paste("Boxplot_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 1200, height = 600)
par(mfrow=c(1,2))
boxplot(log2(cntNorm+1), outline=T, #ylim=c(0,20),
ylab = "log2 counts", main=paste("Normalized, \nTSS < ",TSS,",\nmean count > ",cutoff),
col=c(rep("orange",20),rep("darkgreen",20)))
boxplot(log2(cnts_noZero+1), outline=T, #ylim=c(0,20),
ylab = "log2 counts", main=paste("RAW, \nTSS < ",TSS,",\nmean count > ",cutoff),
col=c(rep("orange",20),rep("darkgreen",20)))
dev.off()
###################################################################
#
# Heatmap-fold change
#
fc <- log2(cntNorm+1) - log2(rowMeans(cntNorm[,1:20])+1)
breaks = unique(c(seq(-1,0,length=75), seq(0,1,length=75)))
tiff(paste("Heatmap_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 450, height = 500)
heatmap.2(fc, main=paste("Relative to SETDB1 High mean, \nTSS < ",TSS,",\nmeanCount > ",cutoff),
trace="none",density="none", breaks = breaks,
col=bluered, Colv = F, margins = c(8,2),
ColSideColors = c(rep("orange",20),rep("darkgreen",20)))
dev.off()
###################################################################
#
# PCA
#
rld <- rlogTransformation(dds,fitType = "mean")
## Automatic
#pcaplot(rld,intgroup=c("conds"),ntop=500, text_labels=F, title="PCA",pcX=1,pcY=2,point_size=3,ellipse=F)
## Manual
# select top n elements with larger variations
ntop=5000
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,length(rv)))]
# run PCA
pca <- prcomp(t(assay(rld)[select,]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
# plot PCA result
df <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], group = conds, t(setdb1))
colnames(df) <-c("PC1","PC2","conds","Setdb1")
tiff(paste("PCA_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 400, height = 300)
ggplot(data = df, aes_string(x = "PC1", y = "PC2", size="Setdb1", color="Setdb1")) +
geom_point(alpha=.8) + theme_bw() +
xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) +
ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) +
scale_colour_gradient2(midpoint=40,low="green",mid="grey60",high="red", space = "Lab", na.value = "grey50", guide = "colourbar")
dev.off()
#
#vsd <- varianceStabilizingTransformation(dds, fitType = "mean")
#pcaplot(vsd,intgroup=c("conds"),text_labels=F,title="PCA",point_size=5,ellipse=F,pcX=1,pcY=2,ntop=100000)
####################################################################
#
# Pearson correlation
#
dta.cor <- round(cor(cntNorm,method="pearson"),5)
tiff(paste("Pearson_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 500, height = 500)
heatmap.2(dta.cor, main=paste("Pearson, \nTSS < ",TSS,",\nmean count > ",cutoff),
col=bluered,density="none",trace="none",
#cellnote = dta.cor,notecol="black",notecex=2.5,
Rowv = T,Colv = T, dendrogram="both", margins = c(8,8),
key.title="Pearson",key.xlab = NULL,key.ylab = NULL
,RowSideColors=c(rep("orange",20),
rep("darkgreen",20)),
ColSideColors=c(rep("orange",20),
rep("darkgreen",20)))
dev.off()
####################################################################
#
# DE analysis
#
res <- results(dds, contrast = c("conds","SETDB1_Low","SETDB1_High"))
#head(res)
#nrow(subset(res, pvalue<=.05 & log2FoldChange <= -1))
#nrow(subset(res, pvalue<=.05 & log2FoldChange >= 1))
res_sig <- subset(res, pvalue<=.05 & abs(log2FoldChange)>=1)
write.csv(res_sig,paste("DE_",type,"_TSS_",TSS,"_meanCount_",cutoff,".csv", sep=""))
# volcano plot
#res$pvalue[-log10(res$pvalue) > 10] <- 10 # outlier cutoff
#res$log2FoldChange[abs(res$log2FoldChange) > 5] <- 5 # outlier cutoff
tiff(paste("Volcano_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 350, height = 400)
with(res, plot(log2FoldChange,-log10(pvalue),
pch=19,cex=1,main="SETDB1 LOW(-) HIGH(+)",
col=rgb(.8,.8,.8,.3)))
with(subset(res, pvalue<=0.05 & log2FoldChange>=1),
points(log2FoldChange,-log10(pvalue),pch=19,cex=1,col=rgb(.8,.5,0,.8)))
with(subset(res, pvalue<=0.05 & log2FoldChange<=-1),
points(log2FoldChange,-log10(pvalue),pch=19,cex=1,col=rgb(0,.4,0,.8)))
abline(v=c(-1,1),col="blue",lty=2)
abline(v=0,col="blue")
abline(h=c(-log10(0.05)),col="blue",lty=2)
dev.off()
####################################################################
####################################################################
#
# Boxplot - mean expression distribution
#
high_mean <- rowMeans(cntNorm[,1:20])
low_mean <- rowMeans(cntNorm[,21:40])
df2<- data.frame(high_mean,low_mean)
head(df2)
wt <- wilcox.test(as.numeric(high_mean), as.numeric(low_mean))
pval <- round(wt$p.value,5) # pvalues
tiff(paste("Boxplot_",type,"_TSS_",TSS,"_meanCount_",cutoff,".tif", sep=""),units = "px",width = 300, height = 500)
par(mar=c(6,5,3,3), cex.main=0.5)
stripchart(log2(df2+1), main=paste(type,", TSS < ",TSS,", mean count > ",cutoff,",\nWilcoxon = ",pval,sep=""),
method='jitter', pch=20,cex=.9, vertical=T,ylab="log2(cntNorm + 1)",
col=(c(rgb(.8,.8,.3,.7),rgb(.3,.8,.8,.7))),axes = F,jitter=.2)
boxplot(log2(df2+1), las=2, col=(c(rgb(.8,.3,.3,.5),rgb(.3,.3,.8,.5))), add = T,
outline=F)
dev.off()
###################################################################
#
# k-means clustering
#
cl <- kmeans(fc,12)
cluster<-cl$cluster
table(cluster)
rNames <- rownames(fc)
df<-data.frame(fc, rNames, cluster) # attach cluster info on the data frame
mdata <- melt(df, id=c("rNames","cluster"))
head(mdata)
# plotting using ggplot2
p <- ggplot(mdata, aes(x=variable, y=value, group=rNames, colour=variable))
jit<- position_jitter (width = .08, height = .08)
p +
geom_line(position = jit, colour = alpha("black", 0.1)) + # individual data
stat_summary(aes(group = cluster), fun.y = mean, geom = "line", size = 1.5 ) + # mean values of each cluster
facet_wrap(~ cluster) # plots divided by the clusters
ggsave(filename=paste("kmeans_",type,"_TSS_",TSS,"_meanCount_",cutoff,".png", sep=""), width = 12, height = 9, dpi = 600)
write.csv(table(cluster),paste("kmeans_TSS_",TSS,"_meanCount_",cutoff,"_Table.csv", sep=""))
######################################################################################
#
# modified source code for PCA function
#
runPCA <- function (object, ...)
{
.local <- function (object,intgroup="condition",ntop=500,returnData=T)
{
rv <- rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
if (!all(intgroup %in% names(colData(object)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(colData(object)[, intgroup,drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = ":"))
}
else {
colData(object)[[intgroup]]
}
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], group = group,
intgroup.df, name = colnames(object))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:2]
return(d)
}
#ggplot(data = d, aes_string(x = "PC1", y = "PC2", color = "group")) +
# geom_point(size = 5, alpha=0.8) + xlab(paste0("PC1: ", round(percentVar[1] * 100), "% variance")) +
# ylab(paste0("PC2: ", round(percentVar[2] * 100), "% variance")) + coord_fixed() +
# scale_color_manual(values=c("darkorange2", "darkgreen"))
}
.local(object, ...)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment