Skip to content

Instantly share code, notes, and snippets.

@mbk0asis
Created August 30, 2019 05:09
Show Gist options
  • Save mbk0asis/aaee2021f2a58b609a8ee86094206f75 to your computer and use it in GitHub Desktop.
Save mbk0asis/aaee2021f2a58b609a8ee86094206f75 to your computer and use it in GitHub Desktop.
BiocUpgrade()
source("http://bioconductor.org/biocLite.R")
biocLite(c("DESeq2","gplots","pcaExplorer","bovine.db","calibrate","AnnotationFuncs","gage","Rtsne","ggrepel"))
#library(TCGAbiolinks)
library(DESeq2)
library(pcaExplorer)
library(ggfortify)
library(ggplot2)
library(gplots)
library(ggrepel)
library(gage)
library(Rtsne)
library(reshape2)
library(tidyverse)
library(org.Hs.eg.db)
wdir<-"~/BIO0/00-NGS/SETDB1_TCGA/"
setwd(wdir)
luad_tumor <- read.csv("~/BIO0/00-NGS/SETDB1_TCGA/SETDB1_Top100_Bot100_LUAD/exp.LUAD.TUMOR.SETDB1.Top100.Bot100.htseq-count.csv", header=T, row.names = 1)
luad_normal <- read.csv("~/BIO0/00-NGS/SETDB1_TCGA/SETDB1_Top100_Bot100_LUAD/exp.LUAD.NORMAL.htseq-counts.csv", header=T, row.names = 1)
lusc_tumor <- read.csv("~/BIO0/00-NGS/SETDB1_TCGA/SETDB1_Top100_Bot100_LUSC/exp.LUSC.TUMOR.Setdb1.Top100.Bot100.htseq-count.csv", header=T, row.names = 1)
lusc_normal <- read.csv("~/BIO0/00-NGS/SETDB1_TCGA/SETDB1_Top100_Bot100_LUSC/exp.LUSC.NORMAL.htseq-counts.csv", header=T, row.names = 1)
nLUAD_Tumor <- ncol(luad_tumor)
nLUSC_Tumor <- ncol(lusc_tumor)
nLUAD_Normal <- ncol(luad_normal)
nLUSC_Normal <- ncol(lusc_normal)
Normal <- nLUAD_Normal + nLUSC_Normal
cnts <- data.frame(luad_tumor,lusc_tumor,luad_normal,lusc_normal)
head(cnts[,1:5])
dim(cnts)
# To remove all zero rows
cnts_noZero <- cnts[rowMeans(abs(cnts))>0,]
dim(cnts_noZero)
conds <- c(rep("luad_top100",100),rep("luad_bot100",100),rep("lusc_top100",100),rep("lusc_bot100",100),
rep("luad_normal",nLUAD_Normal),rep("lusc_normal",nLUSC_Normal))
conds <- c(rep("luad_top100",100),rep("luad_bot100",100),
rep("lusc_top100",100),rep("lusc_bot100",100),
rep("normal",108))
coldat=DataFrame(conds=factor(conds))
coldat
dds <- DESeqDataSetFromMatrix(cnts_noZero, colData=coldat, design = ~ conds)
dds <- DESeq(dds, fitType = "mean")
dds
# to obtain normalized count data
cntNorm <- counts(dds, normalized=T)
#write.csv(cntNorm,"cntNorm.LUSC.Setdb1.Top100.Bot100.csv")
############################################################################################################
# to select Setdb1 expression Top100/Bot100
##############################################
# ggplot
setdb1 <- "ENSG00000143379"
exp.setdb1 <- cntNorm[grep(setdb1,rownames(cntNorm)),]
names(exp.setdb1)
type1 <- c(rep("luad_tumor",nLUAD_Tumor),
rep("lusc_tumor",nLUSC_Tumor),
rep("luad_normal",nLUAD_Normal),
rep("lusc_normal",nLUSC_Normal))
type2 <- c(rep("luad_tumor",nLUAD_Tumor),
rep("lusc_tumor",nLUSC_Tumor),
rep("normal",nLUAD_Normal + nLUSC_Normal))
m.exp.setdb1 <- melt(exp.setdb1)
m.exp.setdb1$type1 <- type1
m.exp.setdb1$type2 <- type2
m.exp.setdb1$type1 <- factor(m.exp.setdb1$type1, levels = c("luad_tumor","lusc_tumor","luad_normal","lusc_normal"))
m.exp.setdb1.sorted <- m.exp.setdb1[order(type1,-m.exp.setdb1$value),]
aggregate(value ~ type1, data=m.exp.setdb1.sorted, FUN = function(x){NROW(x)})
ggplot(m.exp.setdb1.sorted, aes(x=type1, y=log2(value))) +
theme_bw() +
geom_jitter(width=.1, alpha=.8, aes(color=type1)) +
#geom_violin(aes(fill=type2), alpha=.5) +
geom_boxplot(width=.4, alpha=.5, outlier.shape = NA) +
scale_color_brewer(palette="Set1")
###############################################
# Generic boxplot
nSamples=100 # number of samples to be subsetted
nCol <- ncol(cntNorm)
setdb1 <- "ENSG00000143379"
exp.setdb1 <- cntNorm_Tumor[grep(setdb1,rownames(cntNorm)),]
exp.setdb1.Tumor.sorted <- data.frame(exp.setdb1.Tumor[order(exp.setdb1.Tumor)],exp.setdb1.Tumor[order(exp.setdb1.Tumor)])
colnames(exp.setdb1.Tumor.sorted) <- rep("SETDB1",2)
dim(exp.setdb1.Tumor.sorted)
head(exp.setdb1.Tumor.sorted)
# LUAD_Tumor
meanHIGH = log2(colMeans(exp.setdb1.Tumor.sorted[topStart:nTumor,]))
meanLOW = log2(colMeans(exp.setdb1.Tumor.sorted[1:nSamples,]))
high <- tail(exp.setdb1.Tumor.sorted,nSamples)
nHigh <- nrow(high)
low <- head(exp.setdb1.Tumor.sorted,nSamples)
nLow <- nrow(low)
log2FC_LUAD_Tumor <- round(meanHIGH[1]-meanLOW[1],2)
# LUSC_Tumor
cntNorm_Tumor <- cntNorm[,1:nTumor]
dim(cntNorm_Tumor)
exp.setdb1.Tumor <- cntNorm_Tumor[grep(setdb1,rownames(cntNorm_Tumor)),]
exp.setdb1.Tumor.sorted <- data.frame(exp.setdb1.Tumor[order(exp.setdb1.Tumor)],exp.setdb1.Tumor[order(exp.setdb1.Tumor)])
colnames(exp.setdb1.Tumor.sorted) <- rep("SETDB1",2)
dim(exp.setdb1.Tumor.sorted)
head(exp.setdb1.Tumor.sorted)
meanHIGH = log2(colMeans(exp.setdb1.Tumor.sorted[topStart:nTumor,]))
meanLOW = log2(colMeans(exp.setdb1.Tumor.sorted[1:nSamples,]))
high <- tail(exp.setdb1.Tumor.sorted,nSamples)
nHigh_lusc_tumor <- nrow(high)
low <- head(exp.setdb1.Tumor.sorted,nSamples)
nLow_lusc_tumor <- nrow(low)
log2FC_LUSC_Tumor <- round(meanHIGH[1]-meanLOW[1],2)
nLUAD_Tumor <- ncol(luad_tumor)
nLUAD_Normal <- ncol(luad_normal)
nLUSC_Tumor <- ncol(lusc_tumor)
nLUSC_Normal <- ncol(lusc_normal)
# Normal
cntNorm_Normal <- cntNorm[,(nTumor+1):nCol]
exp.setdb1.Normal <- cntNorm_Normal[grep(setdb1,rownames(cntNorm_Normal)),]
exp.setdb1.Normal.sorted <- data.frame(exp.setdb1.Normal[order(exp.setdb1.Normal)],exp.setdb1.Normal[order(exp.setdb1.Normal)])
colnames(exp.setdb1.Normal.sorted) <- rep("SETDB1",2)
mean_Normal <- round(log2(colMeans(exp.setdb1.Normal.sorted)),2)
# Boxplots
par(mfrow=c(1,2))
boxplot(log2(exp.setdb1.Tumor.sorted[,1]),outline=T,pars=list(outcol="white"),col=(rgb(1,1,1,0)), ylim=c(10,14),#axes=F,
main=paste("LUSC Tumor\nSETDB1 Top",nSamples,"vs Bot",nSamples,"\nlog2FC = ",log2FC,"(~",round(2^log2FC,2),")"),
ylab = "log2(cntNorm)")
stripchart(log2(high),vertical=T, method = 'jitter', add=T,
pch=20, cex=.9, col=(rgb(.8,.2,.2,.5)))
#stripchart(log2(mid),vertical=T, method = 'jitter', add=T,
# pch=20, cex=.9, col=(rgb(.8,.8,.8,.7)))
stripchart(log2(low),vertical=T, method = 'jitter', add=T,
pch=20, cex=.9, col=(rgb(.2,.2,.8,.5)))
points(meanHIGH[1],pch=7,lwd=4)
text(meanHIGH[1],paste("mean_HIGH\n= ",round(meanHIGH[1],2)),offset=2,pos=4)
points(meanLOW[1],pch=7,lwd=4)
text(meanLOW[1],paste("mean_LOW\n= ",round(meanLOW[1],2)),offset=2,pos=4)
#write.csv(high[,1:2], paste("LUSC.SETDB1.Top",nSamples,".samples",sep=""))
#write.csv(low[,1:2], paste("LUSC.SETDB1.Bot",nSamples,".samples",sep=""))
boxplot(log2(exp.setdb1.Normal.sorted[,1]),outline=T,bty=F,
pars=list(outcol="white"),col=(rgb(1,1,1,0)),ylim=c(10,14),
main = "LUSC Normal")
stripchart(log2(exp.setdb1.Normal.sorted),vertical=T, method = 'jitter', add=T,
pch=20, cex=.9, col=(rgb(.3,.3,.3,.8)))
points(mean_Normal,pch=7,lwd=4)
text(mean_Normal,paste("mean_Norm\n= ",round(mean_Normal,2)), offset=3, pos=4)
#dev.off()
#
############################################################################################################
#
# PCA plot
rld <- rlogTransformation(dds,fitType = "mean")
#pcaplot(rld,intgroup=c("conds"),ntop=50000, text_labels=F, title="PCA",
# pcX=1,pcY=2,point_size=3, ellipse = TRUE)
vsd <- varianceStabilizingTransformation(dds, fitType = "mean")
X=1
Y=2
ntop=30000
pcaData <- pcaplot(vsd, intgroup=c("conds"), pcX=X, pcY=Y, returnData=TRUE, ntop = ntop)
percentVar <- round(100 * attr(pcaData, "percentVar"))
pcaData$conds <- factor(pcaData$conds, levels = c('luad_top100','luad_bot100','lusc_top100','lusc_bot100','normal'))
ggplot(pcaData, aes(pcaData[,1], pcaData[,2], color=conds)) + theme_bw() +
geom_point(size=3, alpha = .7) + #ggtitle("PCA - LUAD & LUSC\nSETDB1 Top100 vs Bot100 vs Normal") +
xlab(paste0("PC",X,": ",percentVar[1],"% variance")) +
ylab(paste0("PC",Y,": ",percentVar[2],"% variance")) +
#coord_fixed() +
scale_color_manual(values=c("blue","red","orange","#00BFC4","yellow4","yellow4")) +
stat_ellipse(aes(x=pcaData[,1], y=pcaData[,2]),type = "t")
########################################################
# t-SNE
dta <- assay(vsd)
head(dta)
dta_sorted <- dta[order(-rowMeans(dta)),]
dta_sorted[5,1:5]
conds <- c(rep("luad_top100",100),rep("luad_bot100",100),
rep("lusc_top100",100),rep("lusc_bot100",100),rep("normal",108))
goi <- data.frame(dta[grepl("ENSG00000143379", rownames(dta)), ])
colnames(goi) <-"goi" # gene of interset
mid<-mean(goi)
head(goi)
d <- dist(t(dta_sorted[1:30000,]))
for ( p in seq(10,50,10)){
set.seed(50) # tsne has some stochastic steps (gradient descent) so need to set random
perplexity = 30
tsne_out <- Rtsne(d, is_distance=T, perplexity=perplexity,
num_threads=20, verbose=TRUE)
##
dta2 <- data.frame(tsne_out$Y, conds, goi)
ggplot(dta2, aes(x=dta2[,1],y=dta2[,2], color=goi)) +#, color=conds)) +
theme_bw() +
geom_point(alpha=.5, size = 3) +
xlab(paste0("Dimension 1")) + ylab(paste0("Dimension 2")) +
scale_color_gradient2(midpoint=mid,low="steelblue",mid="white",high="red",space ="Lab")
#scale_color_manual(values=c("blue","red","#00BFC4","orange","yellow4"))
#stat_ellipse(aes(x=dta2[,1], y=dta2[,2]),type = "t")
ggsave(paste0("tSNE_LUAD_LUSC_SETDB1_Top.Bot100_Normal_perplexity_",perplexity,".png"),
width = 6, height = 4.5, units = "in")
}
'''
###########################################################
# DE analysis
Sample <- "top100"
Control <- "bot100"
mean_top100 <- rowMeans(cntNorm[,1:100])
mean_bot100 <- rowMeans(cntNorm[,101:200])
sd_top100 <- rowSds(cntNorm[,1:100])
sd_bot100 <- rowSds(cntNorm[,101:200])
res <- results(dds, contrast = c("conds",Sample,Control))
res$mean_top100 <- mean_top100
res$sd_top100 <- sd_top100
res$mean_bot100 <- mean_bot100
res$sd_bot100 <- sd_bot100
res$symbol = mapIds(org.Hs.eg.db,
keys=row.names(res),
column="SYMBOL",
keytype="ENSEMBL",
multiVals="first")
# add entrez ID to res
res$entrez = mapIds(org.Hs.eg.db,
keys=row.names(res),
column="ENTREZID",
keytype="ENSEMBL",
multiVals="first")
head(res)
#res2 <- res[,c(2,5:11)]
#dim(res2)
#write.csv(res2, "Whole_Gene_DE_results.csv")
# sig DEGs
res_sig <- subset(res, padj<=.00001 & abs(log2FoldChange)>=1)
head(res_sig)
dim(res_sig)
#write.csv(res_sig,"sig_DEGs_LUSC_SETDB1_Top100_Bot100_q0.00001_fc2.csv")
# volcano plot
nTop100 <- nrow(subset(res, padj<=.00001 & log2FoldChange > 1))
nBot100 <- nrow(subset(res, padj<=.00001 & log2FoldChange < -1))
nTop100
nBot100
res$padj[res$padj <= 1e-20] <- 1e-20
res$log2FoldChange[abs(res$log2FoldChange) > 6] <- 6 # outlier cutoff
with(res, plot(log2FoldChange,-log10(padj),
pch=19,cex=0.5,main=paste("TCGA LUSC\nSETDB1 (q<0.00001, fc>2)\n Bot100 (",nBot100,") vs Top100 (",nTop100,")",sep=""),
col="lightgrey"))
with(subset(res, padj<=.00001 & log2FoldChange > 1),
points(log2FoldChange,-log10(padj),pch=19,cex=0.7,col=rgb(1,.6,0,.7)))
with(subset(res, padj<=.00001 & log2FoldChange < -1),
points(log2FoldChange,-log10(padj),pch=19,cex=0.7,col=rgb(.2,.6,.2,.7)))
abline(v=c(-1,1),col="blue",lty=2)
abline(v=0,col="blue")
abline(h=c(-log10(.00001)),col="blue",lty=2)
##################################################################################
##################################################################################
# GAGE
library(gage)
data(korg) # available KEGG databases
head(korg,50)
data(bods) # available GO databases
head(bods,50)
########################################
# KEGG
# KEGG gene sets
kg.bta=kegg.gsets("hsa")
kegg.gs=kg.bta$kg.sets[kg.bta$sigmet.idx]
# to export KEGG gene sets
#kegg.gs.table <- plyr::ldply(kegg.gs, rbind)
#write.csv(kegg.gs.table,"kegg.gs.table.csv")
#
foldchanges = res$log2FoldChange
names(foldchanges) = res_sig$entrez
kegg.p<-gage(foldchanges, gsets=kegg.gs, same.dir=T, compare='unpaired')
head(kegg.p$greater[,c(3,4)])
head(kegg.p$less[,c(3,4)])
write.table(kegg.p$greater[,c(3,4)],"gage_KEGG_SETDB1_Top100.txt")
write.table(kegg.p$less[,c(3,4)],"gage_KEGG_SETDB1_Bot100.txt")
# PATHVIEW
library(pathview)
cnts.d = log2(rowMeans(cntNorm_ent[, ft.idx]))-log2(rowMeans(cntNorm_ent[, fi.idx]))
sel <- kegg.p$greater[, "p.val"] < 0.05 & !is.na(kegg.p$greater[,"p.val"])
path.ids <- rownames(kegg.p$greater)[sel]
sel.l <- kegg.p$less[, "p.val"] < 0.05 & !is.na(kegg.p$less[,"p.val"])
path.ids.l <- rownames(kegg.p$less)[sel.l]
path.ids2 <- substr(c(path.ids, path.ids.l), 1, 8)
pv.out.list <- sapply(path.ids2, function(pid) pathview(gene.data = cnts.d, pathway.id = pid, species = "bta", kegg.native = T))
########################################
# Gene Ontology
## GO gene sets for Human
go.gs=go.gsets(species="Human")
go.bp=go.gs$go.sets[go.gs$go.subs$BP]
go.cc=go.gs$go.sets[go.gs$go.subs$CC]
go.mf=go.gs$go.sets[go.gs$go.subs$MF]
foldchanges = res$log2FoldChange
names(foldchanges) = res$entrez
head(foldchanges)
go.p<-gage(foldchanges, gsets=go.bp, same.dir=T, compare='unpaired')
head(go.p$greater[,c(3,4)])
head(go.p$less[,c(3,4)])
go_ampl <- go.p$greater[,3:4]
go_ampl_sig <- subset(go_ampl,go_ampl[,1] <= 0.05)
dim(go_ampl_sig)
head(go_ampl_sig)
tail(go_ampl_sig)
write.csv(go_ampl_sig,"gage_GO_BP_LUSC_SETDB1_Top100.csv")
go_dipl <- go.p$less[,3:4]
go_dipl_sig <- subset(go_dipl, go_dipl[,1] <= 0.05)
dim(go_dipl_sig)
head(go_dipl_sig)
tail(go_dipl_sig)
write.csv(go_dipl_sig,"gage_GO_BP_LUSC_SETDB1_Bot100.csv")
##############
foldchanges = data.frame(res$entrez, res$log2FoldChange, res$padj)
res.ampl <- subset(foldchanges, res.padj<=.05 & res.log2FoldChange >= 1)
res.dipl <- subset(foldchanges, res.padj<=.05 & res.log2FoldChange <= -1)
res.ampl <- subset(foldchanges,res.log2FoldChange >= 1)
res.dipl <- subset(foldchanges,res.log2FoldChange <= -1)
head(res.dipl)
terms <- read.csv("terms", header = F)
for (t in terms[,1]){
print(t)
glist<-data.frame(unlist(go.bp[t]))
colnames(glist) <- "entrez"
dim(glist)
dim(res.ampl)
overlapGenes <- merge(glist, foldchanges, by.x="entrez",by.y="res.entrez")
overlapGenes2 <- subset(overlapGenes, res.padj <= 0.05)
genes <- mapIds(org.Hs.eg.db,
keys=as.character(overlapGenes2[,1]),
keytype="ENTREZID",
column="SYMBOL")
genes <- paste(genes,collapse=",")
print(genes)
# output[[t]] <- paste(t,genes, sep=";")
}
output
########################################
# MSigDB
# MSigDB gene sets - GO_BP_symbol
filename=("msigdb.v6.1.symbols.gmt")
msig.gs=readList(filename)
foldchanges = res$log2FoldChange
names(foldchanges) = res$symbol
head(foldchanges)
ampl.LUSC <- cntNorm[,1:74]
dipl.LUSC <- cntNorm[,75:206]
msig.p<-gage(foldchanges, gsets=msig.gs, same.dir=T, compare='unpaired')
msig.greater <- msig.p$greater[,c(3,4)]
go.msig.greater <- msig.greater[grep("GO_", rownames(msig.greater)),]
sig.go.msig.greater <- subset(go.msig.greater, go.msig.greater[,2] <= 0.05)
dim(sig.go.msig.greater)
head(sig.go.msig.greater,20)
msig.less <- msig.p$less[,c(3,4)]
go.msig.less <- msig.less[grep("GO_", rownames(msig.less)),]
sig.go.msig.less <- subset(go.msig.less, go.msig.less [,2] <= 0.05)
dim(sig.go.msig.less)
head(sig.go.msig.less,20)
write.csv(sig.go.msig.greater,"gage_MSig_BP_Setdb1_Top100.csv")
write.csv(sig.go.msig.less,"gage_MSig_BP_Setdb1_Bot100.csv")
########################
foldchanges = data.frame(res$symbol, res$log2FoldChange, res$padj)
head(foldchanges)
#res.ampl <- subset(foldchanges, res.padj<=.05 & res.log2FoldChange >= 1)
#res.dipl <- subset(foldchanges, res.padj<=.05 & res.log2FoldChange <= -1)
res.ampl <- subset(foldchanges, res.padj<=.05)
res.dipl <- subset(foldchanges, res.padj<=.05)
#res.ampl <- subset(foldchanges,res.log2FoldChange >= 1)
#res.dipl <- subset(foldchanges,res.log2FoldChange <= -1)
terms <- row.names(sig.go.msig.greater)
terms <- row.names(sig.go.msig.less)
output <- list()
for (t in terms){
# print(t)
glist<-data.frame(unlist(msig.gs[t]))
colnames(glist) <- "symbol"
overlapGenes <- merge(glist, res.ampl, by.x="symbol",by.y="res.symbol")
genes <- paste(overlapGenes$symbol,collapse=",")
# print(genes)
output[[t]] <- paste(genes, sep=";")
}
write.csv(data.frame(unlist(output)),"gage_MSig_ampl.csv")
###############################################################################
# survival plot
surv <- read.csv("Clinical.Setdb1.Top100.Bot100.csv", header = T)
surv2 <- surv[,c(8,11,12)]
setdb1 <- c(rep("top100",116), rep("bot100",118))
surv <- read.csv("Clinical.Setdb1.Top100.Bot100.uniq.csv", header = T)
head(surv)
tail(surv)
surv2 <- surv[,c(2:4)]
head(surv2)
dim(surv2)
setdb1 <- c(rep("top100",99), rep("bot100",100))
dta <- data.frame(surv2, setdb1)
head(dta)
tail(dta)
TCGAanalyze_survival(dta, clusterCol="setdb1",risk.table = FALSE,
height = 4, width = 6, dpi = 300,
conf.int = T, xlim = 4000,
color = c("blue","red"))
###############################################################################
# TE / Epi-drivers - PCA/heatmaps
te <- "ERVL.3kb"
dta <- read.csv( paste("cntNorm.LUSC.Setdb1.Top100.Bot100.Ensembl_Genes.TSS.5kb.",te,".csv",sep=""), row.names = 1, header=F)
dta <- read.csv("cntNorm.LUSC.Setdb1.Top100.Bot100.EpiDrivers.csv", row.names = 1, header=F)
head(dta[,1:5])
nGenes <- nrow(dta)
mean_bot100 <- rowMeans(dta[,101:200])
fc <- log2(dta + 1) - log2(mean_bot100 + 1)
my_palette <- colorRampPalette(c("steelblue", "lightyellow", "red"))
breaks = unique(c(seq(-2,0,length=75), seq(0,2,length=75)))
heatmap.2(as.matrix(fc),density="none",trace="none", col="bluered", breaks=breaks,
Rowv = T, Colv = T, dendrogram="row", margins = c(8,8),
ColSideColors = c(rep("orange",100),rep("darkgreen",100)),
RowSideColors = c(rep("white",95),"black",rep("white",63)),
main = paste("EpiDrivers (",nGenes,")",sep=""))
main = paste("TSS < 5kb,\n",te," (",nGenes,")",sep=""))
# PCA
dta <- read.csv("cntNorm.LUSC.Setdb1.Top100.Bot100.EpiDrivers.csv", row.names = 1, header=F)
head(dta[,1:5])
dta.sorted <- dta[order(rowMeans(dta),decreasing=T), ]
head(dta.sorted[,1:5])
pc <- prcomp(t(dta.sorted[1:100,]))
autoplot(pc, data= sampleInfo,
colour="conds", alpha=.5,
size=2) + theme_bw()
coord_cartesian(xlim = c(-0.05, 0.1), ylim = c(-0.1,0.1))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment