Created
June 23, 2015 07:54
-
-
Save soh-i/0de55a2a62ceb0221da0 to your computer and use it in GitHub Desktop.
Pretty simple CLI tool for generating fasta record from genomic region defined by GFF record
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pyfasta import Fasta | |
| import HTSeq | |
| import argparse | |
| import re | |
| def __mk_rna(*seq): | |
| return re.sub(r"[tT]", "U", "".join(seq)).upper() | |
| def __query_builder(rec): | |
| if not isinstance(rec, HTSeq.GenomicFeature): | |
| raise TypeError() | |
| else: | |
| return {"chr": rec.iv.chrom, "start": rec.iv.start, "stop": rec.iv.end, "strand": rec .iv.strand} | |
| def __extract_sequence(q, genome): | |
| fa = Fasta(genome) | |
| return fa.sequence(q) | |
| def __to_fasta(header, seq): | |
| return ">%s\n%s" % (header, seq) | |
| def GFF_to_Fasta(args): | |
| for rec in HTSeq.GFF_Reader(args.gff): | |
| query = __query_builder(rec) | |
| seq = __extract_sequence(query, args.genome) | |
| header = rec.attr["Name"] + ":" + rec.attr["transcript_id"] | |
| print __to_fasta(header, __mk_rna(seq)) | |
| if __name__ == "__main__": | |
| parse = argparse.ArgumentParser(description="Generate fasta sequence from GFF record") | |
| parse.add_argument("--genome", type=str, dest="genome", help="Genome sequence", required=True) | |
| parse.add_argument("--gff", type=str, dest="gff", help="GFF file", required=True) | |
| args = parse.parse_args() | |
| GFF_to_Fasta(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment