massyah · December 19, 2015 22:48
diff --git a/example_go_annotations.R b/example_go_annotations.R
 source("go_annotations.R")

 # Get the GO BP/MF/CC terms ids associated with STAT3  
 gene2gos("STAT3")
 # Get the GO BP/MF/CC terms description associated with STAT3  
 go2terms(gene2gos("STAT3"))

 # union of go terms for a list of genes 
 go2terms(gene2gos(c("STAT3","STAT2")))

 # To select go terms of interest, either go to AMIGO, or build a mapping from terms to GO ID
 # This is very slow, has to be done once 

 go.terms <- as.data.frame(GOTERM)
 unique(go.terms[grep("DNA binding",go.terms$Term),c("go_id","Term")])


 # Once we know the ID of genes of interest, let's annotate a list of genes with them
 # We consider a gene to be associated with a GO term if it's associated with it or with any of its descendants 
 # For the exemple, we chose: 
 # DNA binding corresponds to GO:0003677, 
 # Protein binding to GO:0005515
 # Insuling binding to GO:0043559


 annotate.genes.with.gos(c("STAT3","STAT2","IGF1R","EGFR"),c("GO:0003677","GO:0005515","GO:0043559"),with.descendants=T)

 # Let's display a wordle to get specific annotations (Can take several seconds and spit warnings for lengthy go terms)
 gocloud.for.genes("STAT3")
diff --git a/go_annotations.R b/go_annotations.R


 # Go terms annotation ------ 
 require(GO.db)

 # For word cloud generation (can be commented if wordclouds are not used) 
 library(tm)
 library(wordcloud)

 # TODO: generalize so that we are specie agnostic. Requires to pass a param to the package? 


 # Uncomment for Mouse DB
 # require(org.Mm.eg.db)  # generalize
 # annotation.db<-org.Mm.eg
 # annotation.dbSYMBOL<-org.Mm.egSYMBOL
 # annotation.egGO<-org.Mm.egGO

 # Uncomment for Human DB
 require(org.Hs.eg.db)  
 annotation.db<-org.Hs.eg
 annotation.dbSYMBOL<-org.Hs.egSYMBOL
 annotation.egGO<-org.Hs.egGO
 gene2ez=revmap(annotation.dbSYMBOL)


 gene2go<- function(gene){
  if(!exists(gene,gene2ez)) return(NULL)
 	ez=get(gene,gene2ez)
 	gos=get(ez,annotation.egGO)
     if(length(gos)==0 || is.na(gos)) return(c())
 	goids=lapply(gos,"[[","GOID")
 	sapply(lapply(goids, function(z) get(z, GOTERM)), "Term")
 }

 geneWithGo<- function(go){
  if(!exists(go,revmap(annotation.egGO))) return(c())
  ez.genes=get(go,revmap(annotation.egGO))
  unlist(mget(ez.genes,annotation.dbSYMBOL))
 }

 geneHasAnnotation<- function(gene,term){
 	genes=unlist(strsplit(gene,"+",fixed=T))
 	for(gene in genes){
 		terms=gene2go(gene)
 		if(length(grep(term,terms))>0) return(T)
 	}
 	return(F)
 }

 geneHasAnnotations<- function(gene,terms=c()){
  # If transcript info present, remove it
  gene=strsplit(gene," ",fixed=T)[[1]][1]
 	genes=unlist(strsplit(gene,"+",fixed=T))
 	res=list()
 	for(term in terms){
 		res[term]=F
 	}
 	for(gene in genes){
 		gene.terms=gene2go(gene)
 		for(term in terms){
 			if(length(grep(term,gene.terms))>0){
 				res[term]=T
 			}
 		}
 	}
 	return(as.data.frame(res,row.names=c(gene)))
 }



 # go.descendants <- function(go){
 #   visited=c()
 #   to.visit=c(go)
 #   while(length(to.visit)>0 && !is.na(to.visit)){
 #   	visited=union(visited,to.visit)
 #   	children<-unique(unlist(mget(to.visit,GOBPCHILDREN)))
 #   	children<-children[complete.cases(children)]
 #   	to.visit=setdiff(children,visited)
 #   }
 #   return(visited)
 # }

 # More efficient
 go.descendants <- function(go,onto=c("BP","MF","CC")){
  onto=match.arg(onto)
  if(onto=="BP"){
    onto=GOBPOFFSPRING
  }else if(onto=="MF"){
    onto=GOMFOFFSPRING
  }else if(onto=="CC"){
    onto=GOCCOFFSPRING
  }
 	children<-unique(unlist(mget(go,onto,ifnotfound=NA)))
 	children=union(children,go)
 	return(children[complete.cases(children)])
 }


 go2terms <- function(gos){
 	
  sapply(lapply(gos, function(z) get(z, GOTERM)), "Term")
  # Maybe better?
  # unlist(sapply(mget(go.descendants("GO:0072539"),GOTERM),function(z) Term(z)))
 }


 go2genes <- function(go,with.descendants=FALSE){
  genes=c()
  if(with.descendants){
    #Check if recursive
    goids<-go.descendants(go)
  }else{
    goids<-c(go)
  }
  ezgenes<-unique(unlist(mget(goids,revmap(annotation.egGO),ifnotfound=NA)))
  ezgenes<-ezgenes[!is.na(ezgenes)]
  symbols<-unique(unlist(mget(ezgenes,annotation.dbSYMBOL)))
  return(symbols)
 }


 gene2gos <- function(gene,flatten=T){
  ezgenes<-unique(unlist(mget(gene,gene2ez,ifnotfound=NA)))
  ezgenes<-ezgenes[!is.na(ezgenes)]
  if(flatten){
 	  gos<-unlist(mget(ezgenes,annotation.egGO),recursive=F,use.names=F)
 	  goids=unique(sapply(gos,"[[","GOID"))
 	  return(goids)
  }else{
  	res<-melt(mget(ezgenes,annotation.egGO))
  	res<-res[res$"L3"=="GOID",]
 	res["gene"]=unlist(mget(res$"L1",annotation.dbSYMBOL))
 	res["term"]=go2terms(as.character(res$L2))
  	res<-res[,c("L1","gene","L2","term")]
  	colnames(res)<-c("ezgene","gene","GO","term")
  	rownames(res)<-NULL
  	return(unique(res))
  }
 }

 annotate.genes.with.gos<- function(genes,gos,with.descendants=F,use.descendants=F,go.terms=F){
 	## Value
 	## Returns a logical matrix of genes * terms 
 	## If with.descendants, then the set of go terms is expanded to include the descendants
 	## If use.descendants, then genes are associated with a go term g if they are associated with any of its descendants
 	#get the genes of all the go terms
 	if(with.descendants){
 		gos<-go.descendants(gos)
 	}
 	if(use.descendants){
 		geneByGo<-sapply(gos,function(go) go2genes(go,with.descendants=T))
 	}else{
 		ezgenes=mget(gos,revmap(annotation.egGO),ifnotfound=NA)
 		geneByGo<-lapply(ezgenes,function (z){
 			if(length(z)==1 && is.na(z)) return(c())
 			unique(unlist(mget(z,annotation.dbSYMBOL)))
 			})
 	}

 	genes.annot<-t(unlist(sapply(genes,function(gene){
 		unlist(lapply(geneByGo,function(golist){
 			gene<-strsplit(gene,"_",fixed=T)[[1]][1]
 			length(intersect(unlist(strsplit(gene,"+",fixed=T)),golist))
 			}))
 	})))
 	if(go.terms){
 		colnames(genes.annot)<-go2terms(colnames(genes.annot))
 	}
 	return(genes.annot)
 }

 # go2terms(gene2gos(c("Stat3")))

 gotable.for.genes <- function(genes){
 	some.gos<-gene2gos(genes)
 	some.gos.terms<-go2terms(some.gos)
 	some.gos.lengths<-unlist(lapply(some.gos,function(go) length(go2genes(go,with.descendants=T))))
 	this.gos<-data.table(gos=some.gos,terms=some.gos.terms,lengths=some.gos.lengths)
 	max.freq=max(this.gos[,lengths])
 	max.freq=9266
 	this.gos[,specificity:=min(c(round(max.freq/lengths),1000)),by=gos]
 	return(this.gos)
 }

 gocloud.for.genes <- function(genes,go.table=NULL,rot.per=0.15,scale=c(1.3,.5)){
 	if(is.null(go.table)){
 		go.table<-gotable.for.genes(genes)
 	}
 	#go.table[,specificity:=min(c(round(max.freq/lengths),100)),by=gos]
 	go.table[,terms:=gsub("positive","pos.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("negative","neg.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("protein","prot.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("kinase","kin.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("signaling","signal.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("cellular","c.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("cell","c.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("reactive oxygen species","ROS",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("metabolic","metab.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("metabolism","metab.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("process","proc.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("oxygen","O2",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("production","product.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("regulation","reg.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("activity","actvty.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("electron","e-",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("endoplasmic","endop.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("transcription factor","TF.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("transcription","transcr.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("retinoic acid","RA",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("interleukin","IL",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("interferon","IFN",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("tumor necrosis factor","TNF",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("aryl hydrocarbon receptor","AHR",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("complex","cplx.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("proliferation","prolif.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("nuclear","nucl.",terms,ignore.case=T,fixed=F)]
 	go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)]
 	go.table<-go.table[order(specificity,decreasing=T)]
 	pal2 <- brewer.pal(8,"Dark2")
 	# Shorten some GO terms 
 	wordcloud(go.table$terms,go.table$specificity, scale=scale,max.words=50, random.color=F,random.order=FALSE, rot.per=rot.per,colors=pal2,min.freq=10)
 }
	source("go_annotations.R")

	# Get the GO BP/MF/CC terms ids associated with STAT3
	gene2gos("STAT3")
	# Get the GO BP/MF/CC terms description associated with STAT3
	go2terms(gene2gos("STAT3"))

	# union of go terms for a list of genes
	go2terms(gene2gos(c("STAT3","STAT2")))

	# To select go terms of interest, either go to AMIGO, or build a mapping from terms to GO ID
	# This is very slow, has to be done once

	go.terms <- as.data.frame(GOTERM)
	unique(go.terms[grep("DNA binding",go.terms$Term),c("go_id","Term")])


	# Once we know the ID of genes of interest, let's annotate a list of genes with them
	# We consider a gene to be associated with a GO term if it's associated with it or with any of its descendants
	# For the exemple, we chose:
	# DNA binding corresponds to GO:0003677,
	# Protein binding to GO:0005515
	# Insuling binding to GO:0043559


	annotate.genes.with.gos(c("STAT3","STAT2","IGF1R","EGFR"),c("GO:0003677","GO:0005515","GO:0043559"),with.descendants=T)

	# Let's display a wordle to get specific annotations (Can take several seconds and spit warnings for lengthy go terms)
	gocloud.for.genes("STAT3")


	# Go terms annotation ------
	require(GO.db)

	# For word cloud generation (can be commented if wordclouds are not used)
	library(tm)
	library(wordcloud)

	# TODO: generalize so that we are specie agnostic. Requires to pass a param to the package?


	# Uncomment for Mouse DB
	# require(org.Mm.eg.db) # generalize
	# annotation.db<-org.Mm.eg
	# annotation.dbSYMBOL<-org.Mm.egSYMBOL
	# annotation.egGO<-org.Mm.egGO

	# Uncomment for Human DB
	require(org.Hs.eg.db)
	annotation.db<-org.Hs.eg
	annotation.dbSYMBOL<-org.Hs.egSYMBOL
	annotation.egGO<-org.Hs.egGO
	gene2ez=revmap(annotation.dbSYMBOL)


	gene2go<- function(gene){
	if(!exists(gene,gene2ez)) return(NULL)
	ez=get(gene,gene2ez)
	gos=get(ez,annotation.egGO)
	if(length(gos)==0 \|\| is.na(gos)) return(c())
	goids=lapply(gos,"[[","GOID")
	sapply(lapply(goids, function(z) get(z, GOTERM)), "Term")
	}

	geneWithGo<- function(go){
	if(!exists(go,revmap(annotation.egGO))) return(c())
	ez.genes=get(go,revmap(annotation.egGO))
	unlist(mget(ez.genes,annotation.dbSYMBOL))
	}

	geneHasAnnotation<- function(gene,term){
	genes=unlist(strsplit(gene,"+",fixed=T))
	for(gene in genes){
	terms=gene2go(gene)
	if(length(grep(term,terms))>0) return(T)
	}
	return(F)
	}

	geneHasAnnotations<- function(gene,terms=c()){
	# If transcript info present, remove it
	gene=strsplit(gene," ",fixed=T)[[1]][1]
	genes=unlist(strsplit(gene,"+",fixed=T))
	res=list()
	for(term in terms){
	res[term]=F
	}
	for(gene in genes){
	gene.terms=gene2go(gene)
	for(term in terms){
	if(length(grep(term,gene.terms))>0){
	res[term]=T
	}
	}
	}
	return(as.data.frame(res,row.names=c(gene)))
	}



	# go.descendants <- function(go){
	# visited=c()
	# to.visit=c(go)
	# while(length(to.visit)>0 && !is.na(to.visit)){
	# visited=union(visited,to.visit)
	# children<-unique(unlist(mget(to.visit,GOBPCHILDREN)))
	# children<-children[complete.cases(children)]
	# to.visit=setdiff(children,visited)
	# }
	# return(visited)
	# }

	# More efficient
	go.descendants <- function(go,onto=c("BP","MF","CC")){
	onto=match.arg(onto)
	if(onto=="BP"){
	onto=GOBPOFFSPRING
	}else if(onto=="MF"){
	onto=GOMFOFFSPRING
	}else if(onto=="CC"){
	onto=GOCCOFFSPRING
	}
	children<-unique(unlist(mget(go,onto,ifnotfound=NA)))
	children=union(children,go)
	return(children[complete.cases(children)])
	}


	go2terms <- function(gos){

	sapply(lapply(gos, function(z) get(z, GOTERM)), "Term")
	# Maybe better?
	# unlist(sapply(mget(go.descendants("GO:0072539"),GOTERM),function(z) Term(z)))
	}


	go2genes <- function(go,with.descendants=FALSE){
	genes=c()
	if(with.descendants){
	#Check if recursive
	goids<-go.descendants(go)
	}else{
	goids<-c(go)
	}
	ezgenes<-unique(unlist(mget(goids,revmap(annotation.egGO),ifnotfound=NA)))
	ezgenes<-ezgenes[!is.na(ezgenes)]
	symbols<-unique(unlist(mget(ezgenes,annotation.dbSYMBOL)))
	return(symbols)
	}


	gene2gos <- function(gene,flatten=T){
	ezgenes<-unique(unlist(mget(gene,gene2ez,ifnotfound=NA)))
	ezgenes<-ezgenes[!is.na(ezgenes)]
	if(flatten){
	gos<-unlist(mget(ezgenes,annotation.egGO),recursive=F,use.names=F)
	goids=unique(sapply(gos,"[[","GOID"))
	return(goids)
	}else{
	res<-melt(mget(ezgenes,annotation.egGO))
	res<-res[res$"L3"=="GOID",]
	res["gene"]=unlist(mget(res$"L1",annotation.dbSYMBOL))
	res["term"]=go2terms(as.character(res$L2))
	res<-res[,c("L1","gene","L2","term")]
	colnames(res)<-c("ezgene","gene","GO","term")
	rownames(res)<-NULL
	return(unique(res))
	}
	}

	annotate.genes.with.gos<- function(genes,gos,with.descendants=F,use.descendants=F,go.terms=F){
	## Value
	## Returns a logical matrix of genes * terms
	## If with.descendants, then the set of go terms is expanded to include the descendants
	## If use.descendants, then genes are associated with a go term g if they are associated with any of its descendants
	#get the genes of all the go terms
	if(with.descendants){
	gos<-go.descendants(gos)
	}
	if(use.descendants){
	geneByGo<-sapply(gos,function(go) go2genes(go,with.descendants=T))
	}else{
	ezgenes=mget(gos,revmap(annotation.egGO),ifnotfound=NA)
	geneByGo<-lapply(ezgenes,function (z){
	if(length(z)==1 && is.na(z)) return(c())
	unique(unlist(mget(z,annotation.dbSYMBOL)))
	})
	}

	genes.annot<-t(unlist(sapply(genes,function(gene){
	unlist(lapply(geneByGo,function(golist){
	gene<-strsplit(gene,"_",fixed=T)[[1]][1]
	length(intersect(unlist(strsplit(gene,"+",fixed=T)),golist))
	}))
	})))
	if(go.terms){
	colnames(genes.annot)<-go2terms(colnames(genes.annot))
	}
	return(genes.annot)
	}

	# go2terms(gene2gos(c("Stat3")))

	gotable.for.genes <- function(genes){
	some.gos<-gene2gos(genes)
	some.gos.terms<-go2terms(some.gos)
	some.gos.lengths<-unlist(lapply(some.gos,function(go) length(go2genes(go,with.descendants=T))))
	this.gos<-data.table(gos=some.gos,terms=some.gos.terms,lengths=some.gos.lengths)
	max.freq=max(this.gos[,lengths])
	max.freq=9266
	this.gos[,specificity:=min(c(round(max.freq/lengths),1000)),by=gos]
	return(this.gos)
	}

	gocloud.for.genes <- function(genes,go.table=NULL,rot.per=0.15,scale=c(1.3,.5)){
	if(is.null(go.table)){
	go.table<-gotable.for.genes(genes)
	}
	#go.table[,specificity:=min(c(round(max.freq/lengths),100)),by=gos]
	go.table[,terms:=gsub("positive","pos.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("negative","neg.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("protein","prot.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("kinase","kin.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("signaling","signal.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("cellular","c.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("cell","c.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("reactive oxygen species","ROS",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("metabolic","metab.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("metabolism","metab.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("process","proc.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("oxygen","O2",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("production","product.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("regulation","reg.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("activity","actvty.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("electron","e-",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("endoplasmic","endop.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("transcription factor","TF.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("transcription","transcr.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("retinoic acid","RA",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("interleukin","IL",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("interferon","IFN",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("tumor necrosis factor","TNF",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("aryl hydrocarbon receptor","AHR",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("complex","cplx.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("proliferation","prolif.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("nuclear","nucl.",terms,ignore.case=T,fixed=F)]
	go.table[,terms:=gsub("response","resp.",terms,ignore.case=T,fixed=F)]
	go.table<-go.table[order(specificity,decreasing=T)]
	pal2 <- brewer.pal(8,"Dark2")
	# Shorten some GO terms
	wordcloud(go.table$terms,go.table$specificity, scale=scale,max.words=50, random.color=F,random.order=FALSE, rot.per=rot.per,colors=pal2,min.freq=10)
	}