Last active
August 29, 2015 14:03
-
-
Save bdunnette/a46f7aa95f258ca82554 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
A constant-space parser for the GeneOntology OBO v1.2 format | |
Modified from work by Uli Koehler: http://techoverflow.net/blog/2013/11/18/a-geneontology-obo-v1.2-parser-in-python/ | |
""" | |
from __future__ import with_statement | |
from collections import defaultdict | |
import types | |
import couchdb | |
# NOTE: the connection below is opened as a side effect of loading this
# module. Server() is called without a URL, so whatever default address the
# couchdb library uses applies.
couch = couchdb.Server()
# Database that the script section of this file bulk-writes the parsed terms
# to; it is looked up under the name 'doid'.
db = couch['doid']

__author__ = "Uli Koehler"
__copyright__ = "Copyright 2013 Uli Koehler"
__license__ = "Apache v2.0"
def processGOTerm(goTerm):
    """
    In an object representing a GO term, replace single-element lists with
    their only member.

    Keyword arguments:
        goTerm: Mapping of tag name to the values collected for that tag
                (parseGOOBO passes a defaultdict(list)). Anything accepted
                by dict() works.

    Returns a new plain dictionary; the input object is left untouched.
    Values holding exactly one element are replaced by that element, every
    other value (empty or with several elements) is kept as it is.
    """
    # Copy into a plain dict first: the input may be a defaultdict, and the
    # result must not keep its "create missing keys on lookup" behaviour.
    # .items() is used instead of .iteritems() so this runs on both
    # Python 2.7 and Python 3.
    return {
        key: (value[0] if len(value) == 1 else value)
        for key, value in dict(goTerm).items()
    }
def parseGOOBO(filename):
    """
    Parses a Gene Ontology dump in OBO v1.2 format.

    This is a generator: it yields one dictionary per [Term] stanza, as
    produced by processGOTerm (tag name -> value, or list of values when the
    tag occurs more than once). Only one term is held in memory at a time.
    The file stays open until the generator is exhausted or closed.

    Lines before the first [Term] and lines belonging to any other stanza
    type (e.g. [Typedef]) are skipped. [Term] stanzas without any tag lines
    are not yielded.

    Keyword arguments:
        filename: The filename to read
    """
    with open(filename, "r") as infile:
        currentGOTerm = None  #None while outside of a [Term] stanza
        for line in infile:
            line = line.strip()
            if not line:
                continue  #Skip empty
            if line.startswith("[") and line.endswith("]"):
                #Any stanza header ends the stanza before it, so the pending
                #term has to be emitted here -- not only when the next header
                #happens to be another [Term]. Otherwise the last term in
                #front of a [Typedef] section would be lost.
                if currentGOTerm:
                    yield processGOTerm(currentGOTerm)
                #Only [Term] stanzas are collected, all others are skipped
                if line == "[Term]":
                    currentGOTerm = defaultdict(list)
                else:
                    currentGOTerm = None
            elif currentGOTerm is not None:
                #Tag line inside a [Term]: split at the first colon only,
                #values such as "GO:0000001" contain colons themselves
                key, _, val = line.partition(":")
                currentGOTerm[key].append(val.strip())
        #Add last term (same non-empty check as for the terms before it)
        if currentGOTerm:
            yield processGOTerm(currentGOTerm)
if __name__ == "__main__":
    # Script entry point: parse the given GO OBO file, turn every term into
    # a document (adding '_id' and, where 'is_a' is present, 'parents'),
    # print the documents and their count, then write them all to `db` in
    # one bulk update.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='The input file in GO OBO v1.2 format.')
    args = parser.parse_args()
    #Iterate over GO terms
    newDocs = []
    for goTerm in parseGOOBO(args.infile):
        newDoc = goTerm
        #Use the term's own id as the document id
        newDoc['_id'] = newDoc['id']
        if 'is_a' in newDoc:
            #processGOTerm leaves a bare string for a single is_a line and a
            #list for several; normalise so that every parent is recorded
            #instead of only the single-parent case.
            isA = newDoc['is_a']
            if not isinstance(isA, list):
                isA = [isA]
            #An is_a value looks like "<id> ! <name>"; keep only the id part
            newDoc['parents'] = [parent.split('!')[0].strip() for parent in isA]
        #Single-argument print(...) gives the same output on Python 2 and 3
        print(newDoc)
        newDocs.append(newDoc)
    print("Found %d GO terms" % len(newDocs))
    #Bulk-write all documents and show the per-document result entries
    for doc in db.update(newDocs):
        print(repr(doc))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment