Skip to content

Instantly share code, notes, and snippets.

@daler
Created February 6, 2014 13:09
Show Gist options
  • Save daler/8843828 to your computer and use it in GitHub Desktop.
Save daler/8843828 to your computer and use it in GitHub Desktop.
demo for getting sequences for GFF features
"""
Demonstration of getting the coding sequence for each alterative isoform of
each gene in a GFF file.
Uses pybedtools.BedTool.seq to extract sequences from a FASTA file.
"""
import gffutils
import pybedtools
from gffutils.helpers import asinterval
fasta = '/DATA/genomes/dm3/dm3.masked.fa'
gff = gffutils.example_filename('FBgn0031208.gff')
db = gffutils.create_db(gff, ':memory:')
# Iterate over all genes,
for gene in db.features_of_type('gene'):
# ...and all transcript for each gene
for transcript in db.children(gene, level=1):
coding_sequence = []
# ...and all CDSs for each transcript. To get mRNA sequence instead of
# coding sequence, you can use "exon" instead of "CDS" for
# well-formatted GFF files.
for cds in db.children(transcript, level=1, featuretype='CDS'):
coding_sequence.append(
pybedtools.BedTool.seq((cds.chrom, cds.start, cds.stop), fasta)
)
# We want to ignore genes with transcript but no CDS (e.g., rRNA)
if coding_sequence:
coding_sequence = ''.join(coding_sequence)
# This gives one long line for each seq. Might want to run through
# BioPython if you want nice 80-column wrapped format.
print ">{0.id}.{1.id}\n{2}".format(
gene, transcript, coding_sequence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment