Skip to content

Instantly share code, notes, and snippets.

@bdunnette
Last active August 29, 2015 14:03
Show Gist options
  • Save bdunnette/a46f7aa95f258ca82554 to your computer and use it in GitHub Desktop.
Save bdunnette/a46f7aa95f258ca82554 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
A constant-space parser for the GeneOntology OBO v1.2 format
Modified from work by Uli Koehler: http://techoverflow.net/blog/2013/11/18/a-geneontology-obo-v1.2-parser-in-python/
"""
from __future__ import with_statement
from collections import defaultdict
import types
import couchdb
# Module-level CouchDB handles. Both statements execute whenever this module
# is loaded, not only when it is run as a script.
# NOTE(review): Server() is called without a URL, so it presumably targets
# the couchdb-python default (a local server) -- confirm.
couch = couchdb.Server()
# Database handle used by the script section at the bottom of the file, which
# bulk-writes the parsed terms through db.update().
# NOTE(review): presumably the 'doid' database must already exist on the
# server -- confirm.
db = couch['doid']
__author__ = "Uli Koehler"
__copyright__ = "Copyright 2013 Uli Koehler"
__license__ = "Apache v2.0"
def processGOTerm(goTerm):
    """
    Collapse single-element lists in a GO term to their only member.

    Keyword arguments:
        goTerm: Mapping from tag name to the list of values collected for
                that tag (parseGOOBO passes a defaultdict(list)).

    Returns a new plain dictionary; the input mapping itself is not modified.
    Values whose length is not exactly 1 are kept as they are.
    """
    # A plain dict is returned instead of the defaultdict so that looking up
    # a missing tag on the result raises KeyError rather than silently
    # inserting an empty list.
    # items() is used instead of iteritems() so the function runs on both
    # Python 2.7 and Python 3.
    return {
        key: (value[0] if len(value) == 1 else value)
        for key, value in goTerm.items()
    }
def parseGOOBO(filename):
    """
    Parses a Gene Ontology dump in OBO v1.2 format.

    This is a generator: it yields one dictionary per non-empty [Term]
    stanza, in file order, each passed through processGOTerm. Lines that
    appear before the first [Term] header, and lines that belong to any other
    stanza (e.g. [Typedef]), are ignored.

    Keyword arguments:
        filename: The filename to read
    """
    with open(filename, "r") as infile:
        # None while outside a [Term] stanza, otherwise the tag -> values
        # mapping of the term currently being collected.
        currentGOTerm = None
        for line in infile:
            line = line.strip()
            if not line:
                continue  # Skip empty
            if line.startswith("[") and line.endswith("]"):
                # Any stanza header ends the term collected so far. Emit it
                # before resetting, otherwise the [Term] directly in front of
                # a [Typedef] (or other non-Term) stanza would be lost.
                if currentGOTerm:
                    yield processGOTerm(currentGOTerm)
                # Only [Term] stanzas are collected; everything else is
                # skipped until the next [Term] header.
                if line == "[Term]":
                    currentGOTerm = defaultdict(list)
                else:
                    currentGOTerm = None
            elif currentGOTerm is not None:
                # "tag: value" line inside a [Term]; a tag may repeat, so
                # values are accumulated in a list per tag.
                key, _sep, val = line.partition(":")
                currentGOTerm[key].append(val.strip())
        # Add last term (skipped when the final [Term] stanza had no tags,
        # consistent with how empty terms are treated mid-file).
        if currentGOTerm:
            yield processGOTerm(currentGOTerm)
try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3 has no basestring


def extractParentIds(isA):
    """
    Return the parent identifiers named by a term's 'is_a' value.

    Keyword arguments:
        isA: Either a single 'is_a' string or a list of such strings (a term
             with exactly one 'is_a' tag carries a bare string, a term with
             several carries a list).

    Returns a list with one entry per 'is_a' value: the text in front of the
    first '!' with surrounding whitespace removed.
    """
    entries = [isA] if isinstance(isA, _STRING_TYPES) else isA
    return [entry.split('!')[0].strip() for entry in entries]


if __name__ == "__main__":
    # Parse the given OBO file, turn every term into a CouchDB document and
    # bulk-write all documents to the module-level database handle `db`.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='The input file in GO OBO v1.2 format.')
    args = parser.parse_args()
    #Iterate over GO terms
    newDocs = []
    for newDoc in parseGOOBO(args.infile):
        # The term id doubles as the CouchDB document id.
        newDoc['_id'] = newDoc['id']
        if 'is_a' in newDoc:
            # Handles both a single parent and several parents; previously
            # terms with more than one 'is_a' ended up with an empty list.
            newDoc['parents'] = extractParentIds(newDoc['is_a'])
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print(newDoc)
        newDocs.append(newDoc)
    print("Found %d GO terms" % len(newDocs))
    # Bulk write; print the per-document result reported by the database.
    for doc in db.update(newDocs):
        print(repr(doc))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment