Last active
January 4, 2016 20:09
-
-
Save avrilcoghlan/8671799 to your computer and use it in GitHub Desktop.
Python script to calculate the number of steps from a GO term to the top of the GO hierarchy, using breadth-first search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
from collections import defaultdict | |
class Error (Exception): pass | |
#====================================================================# | |
# define a function to read in the ancestors of each GO term in the GO hierarchy: | |
def read_go_ancestors(go_desc_file): | |
# first read in the input GO file, and make a list of all the GO terms, and the | |
# terms above them in the GO hierarchy: | |
# eg. | |
# [Term] | |
# id: GO:0004835 | |
terms = [] # list of GO terms | |
ancestors = defaultdict(list) # ancestors of a particular GO term, in the hierarchy | |
take = 0 | |
fileObj = open(go_desc_file, "r") | |
for line in fileObj: | |
line = line.rstrip() | |
temp = line.split() | |
if len(temp) == 1: | |
if temp[0] == '[Term]': | |
take = 1 | |
elif len(temp) >= 2 and take == 1: | |
if temp[0] == 'id:': | |
go = temp[1] | |
terms.append(go) # append this to our list of terms | |
elif temp[0] == 'is_a:': # eg. is_a: GO:0048308 ! organelle inheritance | |
ancestor = temp[1] | |
ancestors[go].append(ancestor) | |
elif len(temp) == 0: | |
take = 0 | |
fileObj.close() | |
return(ancestors, terms) | |
#====================================================================# | |
# Noel's function for finding the minimum distances from a query node | |
# to ancestor GO nodes. It returns a dictionary result{} that has the | |
# ancestors and the distances to them. This function uses a breadth-first | |
# search of the tree. | |
def BFS_dist_from_node(query_node, parents): | |
"""Return dictionary containing minimum distances of parent GO nodes from the query""" | |
result = {} | |
queue = [] | |
queue.append( (query_node, 0) ) | |
while queue: | |
node, dist = queue.pop(0) # Take the first item off the list 'queue' (and remove it) | |
result[node] = dist | |
if node in parents: # If the node *has* parents | |
for parent in parents[node]: | |
# Get the first member of each tuple, see | |
# http://stackoverflow.com/questions/12142133/how-to-get-first-element-in-a-list-of-tuples | |
queue_members = [x[0] for x in queue] | |
if parent not in result and parent not in queue_members: # Don't visit a second time | |
queue.append( (parent, dist+1) ) | |
return result | |
#====================================================================# | |
def main(): | |
# check the command-line arguments: | |
if len(sys.argv) != 2 or os.path.exists(sys.argv[1]) == False: | |
print("Usage: %s go_desc_file" % sys.argv[0]) | |
sys.exit(1) | |
go_desc_file = sys.argv[1] | |
# read in the ancestors of each GO term in the GO hierarchy: | |
(ancestors, terms) = read_go_ancestors(go_desc_file) | |
# calculate the distance from each GO term to the three terms at the top of the hierarchy | |
# (biologial process, molecular function, cellular component): | |
for go in terms: | |
bfs_dist_dict = BFS_dist_from_node(go, ancestors) | |
dist1 = bfs_dist_dict['GO:0003674'] if 'GO:0003674' in bfs_dist_dict else 1e+10 # 'molecular function' | |
dist2 = bfs_dist_dict['GO:0005575'] if 'GO:0005575' in bfs_dist_dict else 1e+10 # 'cellular component' | |
dist3 = bfs_dist_dict['GO:0008150'] if 'GO:0008150' in bfs_dist_dict else 1e+10 # 'biological process' | |
dist = min(dist1, dist2, dist3) | |
print("%s %d" % (go, dist)) | |
#====================================================================# | |
if __name__=="__main__": | |
main() | |
#====================================================================# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment