Created
December 19, 2011 18:47
-
-
Save FSX/1498361 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
""" | |
Reads a GenBank file line by line and extracts the ranges of its CDS features. | |
""" | |
import re | |
from pprint import pprint | |
# Bit flags stored alongside each CDS entry returned by extract_cds_ranges().
OPT_COMP = (1 << 0)  # the location text contains "complement"
OPT_JOIN = (1 << 1)  # the location text contains "join"

# Operator wrappers such as "complement(" / "join(" and their closing ")".
_LOCATION_WRAPPER = re.compile(r'[A-Za-z_]+\(|\)')
# One location segment: "12..34", "<12..>34" (partial ends) or a lone "12".
_LOCATION_SEGMENT = re.compile(r'^[<>]?(\d+)(?:\.\.[<>]?(\d+))?$')


def _parse_cds_location(location):
    """Turn one CDS location string into an ``(options, ranges)`` pair.

    ``options`` is a bitmask of OPT_COMP / OPT_JOIN, set when the words
    "complement" / "join" occur anywhere in ``location``.  ``ranges`` is a
    tuple of ``(start, end)`` integer pairs, one per comma-separated segment,
    in the order they appear.

    Partial-end markers (``<`` and ``>``) are dropped, and a single position
    ``n`` is reported as ``(n, n)``.

    Raises ValueError when a segment is not of the form ``a..b`` or ``a``.
    """
    options = 0
    if 'complement' in location:
        options |= OPT_COMP
    if 'join' in location:
        options |= OPT_JOIN

    # Drop every "operator(" prefix and ")" so that nested forms such as
    # join(complement(1..5),complement(10..20)) reduce to a plain
    # comma-separated list of segments.
    bare = _LOCATION_WRAPPER.sub('', location)

    spans = []
    for piece in bare.split(','):
        match = _LOCATION_SEGMENT.match(piece.strip())
        if match is None:
            raise ValueError(
                'unsupported CDS location segment %r in %r' % (piece, location))
        start = int(match.group(1))
        # A lone position has no "..end" part; use the start for both ends.
        end = int(match.group(2) or match.group(1))
        spans.append((start, end))
    return options, tuple(spans)


def extract_cds_ranges(fd):
    """Collect the location ranges of every CDS feature in a GenBank file.

    ``fd`` is any iterable of text lines (an open file, a list of strings).
    A CDS record starts on a line whose first 21 columns contain exactly the
    feature key ``CDS``; its location text may continue over following lines
    and ends at the first line that, once stripped, starts with ``/``
    (a qualifier).  Reading stops at the first line starting with ``ORIGIN``.

    Returns a tuple of ``(options, ranges)`` pairs as produced by
    _parse_cds_location(), in file order.

    Raises ValueError if a CDS location cannot be parsed.
    """
    buf = ''
    ranges = []
    for line in fd:
        # Compare the whole key column instead of a substring test, so header
        # text that merely mentions "CDS" does not start a bogus record.
        if line[:21].strip() == 'CDS':
            buf += line[21:].strip()
        elif buf:
            line = line.strip()
            if not line.startswith('/'):
                # Continuation of a location that wraps over several lines.
                buf += line
            else:
                # First qualifier reached: the location text is complete.
                ranges.append(_parse_cds_location(buf))
                buf = ''
        # The ORIGIN data is not going to be processed in this function
        if line.startswith('ORIGIN'):
            break
    return tuple(ranges)
if __name__ == '__main__':
    import sys

    # Optional first command-line argument selects the GenBank file to read;
    # without it the script falls back to the original sample file name.
    if len(sys.argv) > 1:
        path = sys.argv[1]
    else:
        path = 'Plasmodium falciparum 3D7 chromosome 5 (genbank).txt'

    with open(path, 'r') as fd:
        ranges = extract_cds_ranges(fd)
    # Pretty-print the (options, ranges) tuples, one CDS per entry.
    pprint(ranges)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment