Last active
December 19, 2015 22:48
-
-
Save massyah/6029531 to your computer and use it in GitHub Desktop.
R code to annotate genes (specified by their official symbol) with GO terms, as well as example usage.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
source("go_annotations.R") | |
# Get the GO BP/MF/CC terms ids associated with STAT3 | |
gene2gos("STAT3") | |
# Get the GO BP/MF/CC terms description associated with STAT3 | |
go2terms(gene2gos("STAT3")) | |
# union of go terms for a list of genes | |
go2terms(gene2gos(c("STAT3","STAT2"))) | |
# To select go terms of interest, either go to AMIGO, or build a mapping from terms to GO ID | |
# This is very slow, has to be done once | |
go.terms <- as.data.frame(GOTERM) | |
unique(go.terms[grep("DNA binding",go.terms$Term),c("go_id","Term")]) | |
# Once we know the ID of genes of interest, let's annotate a list of genes with them | |
# We consider a gene to be associated with a GO term if it's associated with it or with any of its descendants | |
# For the exemple, we chose: | |
# DNA binding corresponds to GO:0003677, | |
# Protein binding to GO:0005515 | |
# Insuling binding to GO:0043559 | |
annotate.genes.with.gos(c("STAT3","STAT2","IGF1R","EGFR"),c("GO:0003677","GO:0005515","GO:0043559"),with.descendants=T) | |
# Let's display a wordle to get specific annotations (Can take several seconds and spit warnings for lengthy go terms) | |
gocloud.for.genes("STAT3") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Go terms annotation ------ | |
require(GO.db) | |
# For word cloud generation (can be commented if wordclouds are not used) | |
library(tm) | |
library(wordcloud) | |
# TODO: generalize so that we are specie agnostic. Requires to pass a param to the package? | |
# Uncomment for Mouse DB | |
# require(org.Mm.eg.db) # generalize | |
# annotation.db<-org.Mm.eg | |
# annotation.dbSYMBOL<-org.Mm.egSYMBOL | |
# annotation.egGO<-org.Mm.egGO | |
# Uncomment for Human DB | |
require(org.Hs.eg.db) | |
annotation.db<-org.Hs.eg | |
annotation.dbSYMBOL<-org.Hs.egSYMBOL | |
annotation.egGO<-org.Hs.egGO | |
gene2ez=revmap(annotation.dbSYMBOL) | |
gene2go<- function(gene){ | |
if(!exists(gene,gene2ez)) return(NULL) | |
ez=get(gene,gene2ez) | |
gos=get(ez,annotation.egGO) | |
if(length(gos)==0 || is.na(gos)) return(c()) | |
goids=lapply(gos,"[[","GOID") | |
sapply(lapply(goids, function(z) get(z, GOTERM)), "Term") | |
} | |
geneWithGo<- function(go){ | |
if(!exists(go,revmap(annotation.egGO))) return(c()) | |
ez.genes=get(go,revmap(annotation.egGO)) | |
unlist(mget(ez.genes,annotation.dbSYMBOL)) | |
} | |
geneHasAnnotation<- function(gene,term){ | |
genes=unlist(strsplit(gene,"+",fixed=T)) | |
for(gene in genes){ | |
terms=gene2go(gene) | |
if(length(grep(term,terms))>0) return(T) | |
} | |
return(F) | |
} | |
geneHasAnnotations<- function(gene,terms=c()){ | |
# If transcript info present, remove it | |
gene=strsplit(gene," ",fixed=T)[[1]][1] | |
genes=unlist(strsplit(gene,"+",fixed=T)) | |
res=list() | |
for(term in terms){ | |
res[term]=F | |
} | |
for(gene in genes){ | |
gene.terms=gene2go(gene) | |
for(term in terms){ | |
if(length(grep(term,gene.terms))>0){ | |
res[term]=T | |
} | |
} | |
} | |
return(as.data.frame(res,row.names=c(gene))) | |
} | |
# go.descendants <- function(go){ | |
# visited=c() | |
# to.visit=c(go) | |
# while(length(to.visit)>0 && !is.na(to.visit)){ | |
# visited=union(visited,to.visit) | |
# children<-unique(unlist(mget(to.visit,GOBPCHILDREN))) | |
# children<-children[complete.cases(children)] | |
# to.visit=setdiff(children,visited) | |
# } | |
# return(visited) | |
# } | |
# More efficient | |
go.descendants <- function(go,onto=c("BP","MF","CC")){ | |
onto=match.arg(onto) | |
if(onto=="BP"){ | |
onto=GOBPOFFSPRING | |
}else if(onto=="MF"){ | |
onto=GOMFOFFSPRING | |
}else if(onto=="CC"){ | |
onto=GOCCOFFSPRING | |
} | |
children<-unique(unlist(mget(go,onto,ifnotfound=NA))) | |
children=union(children,go) | |
return(children[complete.cases(children)]) | |
} | |
go2terms <- function(gos){ | |
sapply(lapply(gos, function(z) get(z, GOTERM)), "Term") | |
# Maybe better? | |
# unlist(sapply(mget(go.descendants("GO:0072539"),GOTERM),function(z) Term(z))) | |
} | |
go2genes <- function(go,with.descendants=FALSE){ | |
genes=c() | |
if(with.descendants){ | |
#Check if recursive | |
goids<-go.descendants(go) | |
}else{ | |
goids<-c(go) | |
} | |
ezgenes<-unique(unlist(mget(goids,revmap(annotation.egGO),ifnotfound=NA))) | |
ezgenes<-ezgenes[!is.na(ezgenes)] | |
symbols<-unique(unlist(mget(ezgenes,annotation.dbSYMBOL))) | |
return(symbols) | |
} | |
gene2gos <- function(gene,flatten=T){ | |
ezgenes<-unique(unlist(mget(gene,gene2ez,ifnotfound=NA))) | |
ezgenes<-ezgenes[!is.na(ezgenes)] | |
if(flatten){ | |
gos<-unlist(mget(ezgenes,annotation.egGO),recursive=F,use.names=F) | |
goids=unique(sapply(gos,"[[","GOID")) | |
return(goids) | |
}else{ | |
res<-melt(mget(ezgenes,annotation.egGO)) | |
res<-res[res$"L3"=="GOID",] | |
res["gene"]=unlist(mget(res$"L1",annotation.dbSYMBOL)) | |
res["term"]=go2terms(as.character(res$L2)) | |
res<-res[,c("L1","gene","L2","term")] | |
colnames(res)<-c("ezgene","gene","GO","term") | |
rownames(res)<-NULL | |
return(unique(res)) | |
} | |
} | |
annotate.genes.with.gos<- function(genes,gos,with.descendants=F,use.descendants=F,go.terms=F){ | |
## Value | |
## Returns a logical matrix of genes * terms | |
## If with.descendants, then the set of go terms is expanded to include the descendants | |
## If use.descendants, then genes are associated with a go term g if they are associated with any of its descendants | |
#get the genes of all the go terms | |
if(with.descendants){ | |
gos<-go.descendants(gos) | |
} | |
if(use.descendants){ | |
geneByGo<-sapply(gos,function(go) go2genes(go,with.descendants=T)) | |
}else{ | |
ezgenes=mget(gos,revmap(annotation.egGO),ifnotfound=NA) | |
geneByGo<-lapply(ezgenes,function (z){ | |
if(length(z)==1 && is.na(z)) return(c()) | |
unique(unlist(mget(z,annotation.dbSYMBOL))) | |
}) | |
} | |
genes.annot<-t(unlist(sapply(genes,function(gene){ | |
unlist(lapply(geneByGo,function(golist){ | |
gene<-strsplit(gene,"_",fixed=T)[[1]][1] | |
length(intersect(unlist(strsplit(gene,"+",fixed=T)),golist)) | |
})) | |
}))) | |
if(go.terms){ | |
colnames(genes.annot)<-go2terms(colnames(genes.annot)) | |
} | |
return(genes.annot) | |
} | |
# go2terms(gene2gos(c("Stat3"))) | |
gotable.for.genes <- function(genes){ | |
some.gos<-gene2gos(genes) | |
some.gos.terms<-go2terms(some.gos) | |
some.gos.lengths<-unlist(lapply(some.gos,function(go) length(go2genes(go,with.descendants=T)))) | |
this.gos<-data.table(gos=some.gos,terms=some.gos.terms,lengths=some.gos.lengths) | |
max.freq=max(this.gos[,lengths]) | |
max.freq=9266 | |
this.gos[,specificity:=min(c(round(max.freq/lengths),1000)),by=gos] | |
return(this.gos) | |
} | |
gocloud.for.genes <- function(genes,go.table=NULL,rot.per=0.15,scale=c(1.3,.5)){ | |
if(is.null(go.table)){ | |
go.table<-gotable.for.genes(genes) | |
} | |
#go.table[,specificity:=min(c(round(max.freq/lengths),100)),by=gos] | |
go.table[,terms:=gsub("positive","pos.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("negative","neg.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("protein","prot.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("kinase","kin.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("signaling","signal.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("cellular","c.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("cell","c.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("reactive oxygen species","ROS",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("metabolic","metab.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("metabolism","metab.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("process","proc.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("oxygen","O2",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("production","product.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("regulation","reg.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("activity","actvty.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("electron","e-",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("endoplasmic","endop.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("transcription factor","TF.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("transcription","transcr.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("retinoic acid","RA",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("interleukin","IL",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("interferon","IFN",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("tumor necrosis factor","TNF",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("aryl hydrocarbon receptor","AHR",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("complex","cplx.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("proliferation","prolif.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("nuclear","nucl.",terms,ignore.case=T,fixed=F)] | |
go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)] | |
go.table<-go.table[order(specificity,decreasing=T)] | |
pal2 <- brewer.pal(8,"Dark2") | |
# Shorten some GO terms | |
wordcloud(go.table$terms,go.table$specificity, scale=scale,max.words=50, random.color=F,random.order=FALSE, rot.per=rot.per,colors=pal2,min.freq=10) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment