Skip to content

Instantly share code, notes, and snippets.

@FSX
Created December 19, 2011 18:47
Show Gist options
  • Save FSX/1498361 to your computer and use it in GitHub Desktop.
Save FSX/1498361 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
"""
Reads a Genbank file line by line.
"""
import re
from pprint import pprint
OPT_COMP = (1 << 0)  # set when the location text contains "complement"
OPT_JOIN = (1 << 1)  # set when the location text contains "join"

# Matches the parts of a location that are not range text: an operator name
# together with its opening parenthesis (e.g. "join(") or a closing ")".
_LOCATION_OPERATOR_RE = re.compile(r'[A-Za-z_]+\(|\)')


def _parse_range(raw_range):
    """Convert one location token into a ``(start, end)`` pair of ints.

    Accepts ``start..end`` as well as a bare single position ``n`` (returned
    as ``(n, n)``).  The partial-feature markers ``<`` and ``>`` are dropped
    so that e.g. ``<1..>200`` parses as ``(1, 200)``.

    Raises ``ValueError`` when what remains is not numeric.
    """
    cleaned = raw_range.strip().replace('<', '').replace('>', '')
    start, sep, end = cleaned.partition('..')
    if not sep:
        # A lone position denotes a single base: start and end coincide.
        end = start
    return (int(start), int(end))


def _parse_location(location):
    """Turn a complete location string into ``(options, ranges)``.

    ``options`` is a bitmask of ``OPT_COMP`` / ``OPT_JOIN`` and describes the
    location as a whole; ``ranges`` is a tuple of ``(start, end)`` pairs in
    the order they appear in the text.
    """
    options = 0
    if 'complement' in location:
        options |= OPT_COMP
    if 'join' in location:
        options |= OPT_JOIN

    # Remove every operator wrapper instead of slicing between the last "("
    # and the first ")": the slice silently lost segments for nested forms
    # such as join(complement(1..2),3..4).
    bare = _LOCATION_OPERATOR_RE.sub('', location)
    return (options, tuple(_parse_range(token) for token in bare.split(',')))


def extract_cds_ranges(fd):
    """Collect the location ranges of every CDS feature read from *fd*.

    *fd* is any iterable of text lines (an open file works).  A line whose
    first 21 characters contain ``CDS`` starts a location; following lines are
    appended to it until a line starting with ``/`` (a qualifier) is seen, at
    which point the location is parsed.  Reading stops at the first line that
    starts with ``ORIGIN``.

    Returns a tuple of ``(options, ranges)`` entries, one per CDS, where
    ``options`` is a bitmask of ``OPT_COMP`` and ``OPT_JOIN`` and ``ranges``
    is a tuple of ``(start, end)`` int pairs.  The flags apply to the whole
    location; which individual segment was wrapped in ``complement(...)`` is
    not recorded.

    Raises ``ValueError`` if a location contains a token that is not a plain
    ``start..end`` range or single position (for example a remote reference).
    """
    buf = ''
    ranges = []

    for line in fd:
        if 'CDS' in line[:21]:
            buf += line[21:].strip()
        elif buf:
            line = line.strip()
            if not line.startswith('/'):
                # Continuation of a location that wraps over several lines.
                buf += line
            else:
                # First qualifier reached: the location text is complete.
                ranges.append(_parse_location(buf))
                buf = ''

        # The ORIGIN data is not going to be processed in this function
        if line.startswith('ORIGIN'):
            break

    return tuple(ranges)
if __name__ == '__main__':
    # Local import: only the script entry point needs the command line.
    import sys

    # The sample file stays the default; an optional first command-line
    # argument selects a different GenBank file.
    path = 'Plasmodium falciparum 3D7 chromosome 5 (genbank).txt'
    if len(sys.argv) > 1:
        path = sys.argv[1]

    with open(path, 'r') as fd:
        ranges = extract_cds_ranges(fd)

    pprint(ranges)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment