Skip to content

Instantly share code, notes, and snippets.

@radaniba
Created November 29, 2012 17:10
Show Gist options
  • Select an option

  • Save radaniba/4170456 to your computer and use it in GitHub Desktop.

Select an option

Save radaniba/4170456 to your computer and use it in GitHub Desktop.
The following ready-to-run script reads a genbank file, which is probably a genomic or chromosomal one. It uses the CDS feature to discover the 5' and 3' ends of ORFs. Yes, ORFs are not exactly synonymous with genes, but this is the way we did it. Also, y
#!/usr/bin/env python
import sys
import Bio
from Bio import SeqIO, SeqFeature
from Bio.SeqRecord import SeqRecord
import os
# Copyright(C) 2009 Iddo Friedberg & Ian MC Fleming
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
# Do not remove this comment
def get_interregions(genbank_path,intergene_length=1):
seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
cds_list_plus = []
cds_list_minus = []
intergenic_records = []
# Loop over the genome file, get the CDS features on each of the strands
for feature in seq_record.features:
if feature.type == 'CDS':
mystart = feature.location._start.position
myend = feature.location._end.position
if feature.strand == -1:
cds_list_minus.append((mystart,myend,-1))
elif feature.strand == 1:
cds_list_plus.append((mystart,myend,1))
else:
sys.stderr.write("No strand indicated %d-%d. Assuming +\n" %
(mystart, myend))
cds_list_plus.append((mystart,myend,1))
for i,pospair in enumerate(cds_list_plus[1:]):
# Compare current start position to previous end position
last_end = cds_list_plus[i][1]
this_start = pospair[0]
strand = pospair[2]
if this_start - last_end >= intergene_length:
intergene_seq = seq_record.seq[last_end:this_start]
strand_string = "+"
intergenic_records.append(
SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
description="%s %d-%d %s" % (seq_record.name, last_end+1,
this_start,strand_string)))
for i,pospair in enumerate(cds_list_minus[1:]):
last_end = cds_list_minus[i][1]
this_start = pospair[0]
strand = pospair[2]
if this_start - last_end >= intergene_length:
intergene_seq = seq_record.seq[last_end:this_start]
strand_string = "-"
intergenic_records.append(
SeqRecord(intergene_seq,id="%s-ign-%d" % (seq_record.name,i),
description="%s %d-%d %s" % (seq_record.name, last_end+1,
this_start,strand_string)))
outpath = os.path.splitext(os.path.basename(genbank_path))[0] + "_ign.fasta"
SeqIO.write(intergenic_records, open(outpath,"w"), "fasta")
if __name__ == '__main__':
if len(sys.argv) == 2:
get_interregions(sys.argv[1])
elif len(sys.argv) == 3:
get_interregions(sys.argv[1],int(sys.argv[2]))
else:
print "Usage: get_intergenic.py gb_file [intergenic_length]"
sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment