Created
January 28, 2014 17:05
-
-
Save avrilcoghlan/8671775 to your computer and use it in GitHub Desktop.
Python script to calculate the number of steps from a GO term to the top of the GO hierarchy, using Dijkstra's algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from collections import defaultdict | |
from scipy.sparse import lil_matrix # needed for Dijkstra's algorithm | |
from scipy.sparse.csgraph import dijkstra # needed for Dijkstra's algorithm | |
class Error (Exception): pass | |
#====================================================================# | |
# define a function to read in the ancestors of each GO term in the GO hierarchy: | |
def read_go_ancestors(go_desc_file): | |
# first read in the input GO file, and make a list of all the GO terms, and the | |
# terms above them in the GO hierarchy: | |
# eg. | |
# [Term] | |
# id: GO:0004835 | |
terms = [] # list of GO terms | |
ancestors = defaultdict(list) # ancestors of a particular GO term, in the hierarchy | |
take = 0 | |
fileObj = open(go_desc_file, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
temp = line.split() | |
if len(temp) == 1: | |
if temp[0] == '[Term]': | |
take = 1 | |
elif len(temp) >= 2 and take == 1: | |
if temp[0] == 'id:': | |
go = temp[1] | |
terms.append(go) # append this to our list of terms | |
elif temp[0] == 'is_a:': # eg. is_a: GO:0048308 ! organelle inheritance | |
ancestor = temp[1] | |
ancestors[go].append(ancestor) | |
elif len(temp) == 0: | |
take = 0 | |
fileObj.close() | |
return(ancestors, terms) | |
#====================================================================# | |
# define a function to make the graph of GO terms, ie. create the matrix | |
# containing the graph structure for running Dijkstra's algorithm: | |
def create_go_graph_structure(ancestors, terms): | |
num_terms = len(terms) | |
# create a list containing num_terms lists initialised to 0: | |
# OPTION 1 (use numpy): | |
# matrix = numpy.zeros( (num_terms, num_terms), "d") | |
# OPTION 2 (use scipy.sparse, apparently lil_matrix is good for building up a matrix iteratively): | |
matrix = lil_matrix( (num_terms, num_terms), dtype="float64") | |
# for each GO term X, make an edge to each term Y above it in the GO hierarchy: | |
# (there may be several GO terms above a particular term X in the GO hierarchy) | |
for go in terms: | |
if go in ancestors: # if there are ancestors defined for this go term | |
go_index = terms.index(go) | |
go_ancestors = ancestors[go] | |
for ancestor in go_ancestors: | |
ancestor_index = terms.index(ancestor) | |
matrix[go_index, ancestor_index] = 1 | |
# Note: not all terms have ancestors defined, eg. the three terms at the top of the hierarcy, | |
# and obsolete terms, eg. GO:0000005 | |
return(matrix) | |
#====================================================================# | |
# define a function to run Dijkstra's algorithm between each GO term and the three terms | |
# at the top of the hierarchy (biological process, molecular function, cellular component): | |
def run_dijkstras_for_terms(matrix, terms): | |
# create a dictionary to store the distance from each term to the top of the GO hierarchy: | |
distance = defaultdict(lambda: 1e+10) | |
# find the index in list 'terms' for each of the three terms at the top of the GO | |
# hierarchy: | |
ends = [] | |
# Note: not all the top 3 GO terms are in 'terms' for the test cases: | |
if 'GO:0003674' in terms: | |
end1_index = terms.index('GO:0003674') # molecular function | |
ends.append(end1_index) | |
if 'GO:0005575' in terms: | |
end2_index = terms.index('GO:0005575') # cellular component | |
ends.append(end2_index) | |
if 'GO:0008150' in terms: | |
end3_index = terms.index('GO:0008150') # biological process | |
ends.append(end3_index) | |
for go in terms: | |
term_index = terms.index(go) | |
# run Dijkstra's algorithm, starting at index term_index | |
distances, predecessors = dijkstra(matrix, indices=term_index, return_predecessors=True, directed=True) | |
# find the distance to each of the three terms at the top of the hierarchy | |
min_dist = 1e+10 | |
for end_index in ends: | |
dist = distances[end_index] | |
if dist < min_dist: | |
min_dist = dist | |
assert(go not in distance) | |
distance[go] = min_dist | |
return distance | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 2 or os.path.exists(sys.argv[1]) == False: | |
print("Usage: %s go_desc_file" % sys.argv[0]) | |
sys.exit(1) | |
go_desc_file = sys.argv[1] | |
# read in the ancestors of each GO term in the GO hierarchy: | |
(ancestors, terms) = read_go_ancestors(go_desc_file) | |
# make the graph of GO terms, ie. create the matrix containing the graph structure for | |
# running Dijkstra's algorithm: | |
go_matrix = create_go_graph_structure(ancestors, terms) | |
# run Dijkstra's algorithm between each GO term and the three terms at the top of the hierarchy | |
# (biological process, molecular function, cellular component): | |
distance = run_dijkstras_for_terms(go_matrix, terms) | |
for go, dist in distance.items(): | |
print("%s %d" % (go, dist)) | |
# Note: Noel pointed out that I could have calculated the distance to the top of the hierarchy | |
# using a simpler approach (see calc_dists_to_top_of_GO_using_bfs.py). | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment