avrilcoghlan · January 28, 2014 17:05
diff --git a/calc_dist_to_top_of_GO.py b/calc_dist_to_top_of_GO.py
 import sys
 import os
 from collections import defaultdict
 from scipy.sparse import lil_matrix # needed for Dijkstra's algorithm
 from scipy.sparse.csgraph import dijkstra # needed for Dijkstra's algorithm

 class Error (Exception): pass

 #====================================================================#

 # define a function to read in the ancestors of each GO term in the GO hierarchy:

 def read_go_ancestors(go_desc_file):
    
    # first read in the input GO file, and make a list of all the GO terms, and the
    # terms above them in the GO hierarchy:
    # eg.
    # [Term]
    # id: GO:0004835
    terms = [] # list of GO terms
    ancestors = defaultdict(list) # ancestors of a particular GO term, in the hierarchy
    take = 0
    fileObj = open(go_desc_file, "r")
    for line in fileObj:  
        line = line.rstrip()
        temp = line.split()
        if len(temp) == 1:
           if temp[0] == '[Term]':
               take = 1 
        elif len(temp) >= 2 and take == 1:
            if temp[0] == 'id:':
                go = temp[1]
                terms.append(go) # append this to our list of terms
            elif temp[0] == 'is_a:': # eg. is_a: GO:0048308 ! organelle inheritance
                ancestor = temp[1]
                ancestors[go].append(ancestor)
        elif len(temp) == 0:
            take = 0 
    fileObj.close()
    
    return(ancestors, terms)
    
 #====================================================================#

 # define a function to make the graph of GO terms, ie. create the matrix 
 # containing the graph structure for running Dijkstra's algorithm:

 def create_go_graph_structure(ancestors, terms):

    num_terms = len(terms)
    # create a list containing num_terms lists initialised to 0:
    # OPTION 1 (use numpy): 
    # matrix = numpy.zeros( (num_terms, num_terms), "d")
    # OPTION 2 (use scipy.sparse, apparently lil_matrix is good for building up a matrix iteratively):
    matrix = lil_matrix( (num_terms, num_terms), dtype="float64")
    
    # for each GO term X, make an edge to each term Y above it in the GO hierarchy:
    # (there may be several GO terms above a particular term X in the GO hierarchy)
    for go in terms:
        if go in ancestors: # if there are ancestors defined for this go term
            go_index = terms.index(go)
            go_ancestors = ancestors[go]
            for ancestor in go_ancestors:
                ancestor_index = terms.index(ancestor)
                matrix[go_index, ancestor_index] = 1
    # Note:  not all terms have ancestors defined, eg. the three terms at the top of the hierarcy,
    # and obsolete terms, eg. GO:0000005

    return(matrix)
 
 #====================================================================#

 # define a function to run Dijkstra's algorithm between each GO term and the three terms 
 # at the top of the hierarchy (biological process, molecular function, cellular component):

 def run_dijkstras_for_terms(matrix, terms):
    
    # create a dictionary to store the distance from each term to the top of the GO hierarchy:
    distance = defaultdict(lambda: 1e+10) 
    
    # find the index in list 'terms' for each of the three terms at the top of the GO
    # hierarchy:
    ends = []
    # Note: not all the top 3 GO terms are in 'terms' for the test cases:
    if 'GO:0003674' in terms:
        end1_index = terms.index('GO:0003674') # molecular function
        ends.append(end1_index)
    if 'GO:0005575' in terms:
        end2_index = terms.index('GO:0005575') # cellular component
        ends.append(end2_index)
    if 'GO:0008150' in terms:
        end3_index = terms.index('GO:0008150') # biological process
        ends.append(end3_index)
    
    for go in terms:
        term_index = terms.index(go)
        # run Dijkstra's algorithm, starting at index term_index
        distances, predecessors = dijkstra(matrix, indices=term_index, return_predecessors=True, directed=True)
        # find the distance to each of the three terms at the top of the hierarchy
        min_dist = 1e+10
        for end_index in ends:
            dist = distances[end_index]
            if dist < min_dist:
                min_dist = dist
        assert(go not in distance)
        distance[go] = min_dist
    
    return distance
    
 #====================================================================#

 def main():
    
    # check the command-line arguments:
    if len(sys.argv) != 2 or os.path.exists(sys.argv[1]) == False:
        print("Usage: %s go_desc_file" % sys.argv[0]) 
        sys.exit(1)
    go_desc_file = sys.argv[1]

    # read in the ancestors of each GO term in the GO hierarchy:
    (ancestors, terms) = read_go_ancestors(go_desc_file)  
    
    # make the graph of GO terms, ie. create the matrix containing the graph structure for 
    # running Dijkstra's algorithm:
    go_matrix = create_go_graph_structure(ancestors, terms)
    
    # run Dijkstra's algorithm between each GO term and the three terms at the top of the hierarchy
    # (biological process, molecular function, cellular component):
    distance = run_dijkstras_for_terms(go_matrix, terms)
    for go, dist in distance.items():
        print("%s %d" % (go, dist))
        # Note: Noel pointed out that I could have calculated the distance to the top of the hierarchy
        # using a simpler approach (see calc_dists_to_top_of_GO_using_bfs.py). 

 #====================================================================#

 if __name__=="__main__":
    main()

 #====================================================================#
	import sys
	import os
	from collections import defaultdict
	from scipy.sparse import lil_matrix # needed for Dijkstra's algorithm
	from scipy.sparse.csgraph import dijkstra # needed for Dijkstra's algorithm

	class Error (Exception): pass

	#====================================================================#

	# define a function to read in the ancestors of each GO term in the GO hierarchy:

	def read_go_ancestors(go_desc_file):

	# first read in the input GO file, and make a list of all the GO terms, and the
	# terms above them in the GO hierarchy:
	# eg.
	# [Term]
	# id: GO:0004835
	terms = [] # list of GO terms
	ancestors = defaultdict(list) # ancestors of a particular GO term, in the hierarchy
	take = 0
	fileObj = open(go_desc_file, "r")
	for line in fileObj:
	line = line.rstrip()
	temp = line.split()
	if len(temp) == 1:
	if temp[0] == '[Term]':
	take = 1
	elif len(temp) >= 2 and take == 1:
	if temp[0] == 'id:':
	go = temp[1]
	terms.append(go) # append this to our list of terms
	elif temp[0] == 'is_a:': # eg. is_a: GO:0048308 ! organelle inheritance
	ancestor = temp[1]
	ancestors[go].append(ancestor)
	elif len(temp) == 0:
	take = 0
	fileObj.close()

	return(ancestors, terms)

	#====================================================================#

	# define a function to make the graph of GO terms, ie. create the matrix
	# containing the graph structure for running Dijkstra's algorithm:

	def create_go_graph_structure(ancestors, terms):

	num_terms = len(terms)
	# create a list containing num_terms lists initialised to 0:
	# OPTION 1 (use numpy):
	# matrix = numpy.zeros( (num_terms, num_terms), "d")
	# OPTION 2 (use scipy.sparse, apparently lil_matrix is good for building up a matrix iteratively):
	matrix = lil_matrix( (num_terms, num_terms), dtype="float64")

	# for each GO term X, make an edge to each term Y above it in the GO hierarchy:
	# (there may be several GO terms above a particular term X in the GO hierarchy)
	for go in terms:
	if go in ancestors: # if there are ancestors defined for this go term
	go_index = terms.index(go)
	go_ancestors = ancestors[go]
	for ancestor in go_ancestors:
	ancestor_index = terms.index(ancestor)
	matrix[go_index, ancestor_index] = 1
	# Note: not all terms have ancestors defined, eg. the three terms at the top of the hierarcy,
	# and obsolete terms, eg. GO:0000005

	return(matrix)

	#====================================================================#

	# define a function to run Dijkstra's algorithm between each GO term and the three terms
	# at the top of the hierarchy (biological process, molecular function, cellular component):

	def run_dijkstras_for_terms(matrix, terms):

	# create a dictionary to store the distance from each term to the top of the GO hierarchy:
	distance = defaultdict(lambda: 1e+10)

	# find the index in list 'terms' for each of the three terms at the top of the GO
	# hierarchy:
	ends = []
	# Note: not all the top 3 GO terms are in 'terms' for the test cases:
	if 'GO:0003674' in terms:
	end1_index = terms.index('GO:0003674') # molecular function
	ends.append(end1_index)
	if 'GO:0005575' in terms:
	end2_index = terms.index('GO:0005575') # cellular component
	ends.append(end2_index)
	if 'GO:0008150' in terms:
	end3_index = terms.index('GO:0008150') # biological process
	ends.append(end3_index)

	for go in terms:
	term_index = terms.index(go)
	# run Dijkstra's algorithm, starting at index term_index
	distances, predecessors = dijkstra(matrix, indices=term_index, return_predecessors=True, directed=True)
	# find the distance to each of the three terms at the top of the hierarchy
	min_dist = 1e+10
	for end_index in ends:
	dist = distances[end_index]
	if dist < min_dist:
	min_dist = dist
	assert(go not in distance)
	distance[go] = min_dist

	return distance

	#====================================================================#

	def main():

	# check the command-line arguments:
	if len(sys.argv) != 2 or os.path.exists(sys.argv[1]) == False:
	print("Usage: %s go_desc_file" % sys.argv[0])
	sys.exit(1)
	go_desc_file = sys.argv[1]

	# read in the ancestors of each GO term in the GO hierarchy:
	(ancestors, terms) = read_go_ancestors(go_desc_file)

	# make the graph of GO terms, ie. create the matrix containing the graph structure for
	# running Dijkstra's algorithm:
	go_matrix = create_go_graph_structure(ancestors, terms)

	# run Dijkstra's algorithm between each GO term and the three terms at the top of the hierarchy
	# (biological process, molecular function, cellular component):
	distance = run_dijkstras_for_terms(go_matrix, terms)
	for go, dist in distance.items():
	print("%s %d" % (go, dist))
	# Note: Noel pointed out that I could have calculated the distance to the top of the hierarchy
	# using a simpler approach (see calc_dists_to_top_of_GO_using_bfs.py).

	#====================================================================#

	if __name__=="__main__":
	main()

	#====================================================================#