Created
February 6, 2014 13:09
-
-
Save daler/8843828 to your computer and use it in GitHub Desktop.
demo for getting sequences for GFF features
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Demonstration of getting the coding sequence for each alterative isoform of | |
each gene in a GFF file. | |
Uses pybedtools.BedTool.seq to extract sequences from a FASTA file. | |
""" | |
import gffutils | |
import pybedtools | |
from gffutils.helpers import asinterval | |
fasta = '/DATA/genomes/dm3/dm3.masked.fa' | |
gff = gffutils.example_filename('FBgn0031208.gff') | |
db = gffutils.create_db(gff, ':memory:') | |
# Iterate over all genes, | |
for gene in db.features_of_type('gene'): | |
# ...and all transcript for each gene | |
for transcript in db.children(gene, level=1): | |
coding_sequence = [] | |
# ...and all CDSs for each transcript. To get mRNA sequence instead of | |
# coding sequence, you can use "exon" instead of "CDS" for | |
# well-formatted GFF files. | |
for cds in db.children(transcript, level=1, featuretype='CDS'): | |
coding_sequence.append( | |
pybedtools.BedTool.seq((cds.chrom, cds.start, cds.stop), fasta) | |
) | |
# We want to ignore genes with transcript but no CDS (e.g., rRNA) | |
if coding_sequence: | |
coding_sequence = ''.join(coding_sequence) | |
# This gives one long line for each seq. Might want to run through | |
# BioPython if you want nice 80-column wrapped format. | |
print ">{0.id}.{1.id}\n{2}".format( | |
gene, transcript, coding_sequence) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment