Last active
October 2, 2019 15:30
-
-
Save mpkocher/b117acefb807af10ba81 to your computer and use it in GitHub Desktop.
Example of using pbcore's FastaReader with generators in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import functools | |
import numpy as np | |
from pbcore.io import FastaReader, FastaRecord | |
def _to_records(func, file_name):
    """
    Lazily apply ``func`` to every record read from a FASTA file.

    :param func: callable invoked with each record the reader produces
    :param file_name: value handed to ``FastaReader`` to open the file
    :return: generator of ``func(record)`` results, in the order the
        reader yields the records
    """
    with FastaReader(file_name) as reader:
        for fasta_record in reader:
            converted = func(fasta_record)
            yield converted
def to_pbcore_record(r):
    """
    Identity converter: hand the record back untouched.

    :type r: FastaRecord
    :return: the very same object that was passed in
    """
    unchanged = r
    return unchanged
def to_my_record(r):
    """
    Reduce a record to a ``(name, sequence length)`` pair.

    :type r: FastaRecord
    :return: tuple of ``r.name`` and ``len(r.sequence)``
    """
    record_name = r.name
    n_bases = len(r.sequence)
    return record_name, n_bases
def to_length(r):
    """
    Reduce a record to the length of its sequence.

    :type r: FastaRecord
    :return: ``len(r.sequence)``
    """
    bases = r.sequence
    return len(bases)
def to_movie_id(r):
    """
    Split a read header into its '/'-separated parts plus the read length.

    header looks like:
    >m120201_042231_42129_c100275262550000001523007907041260_s1_p0/9/0_1756 [revcomp]

    Only the text before the first space is considered; it is split on
    '/' and the first three pieces are returned (presumably movie name,
    hole number and read range -- confirm against the data source).

    :type r: FastaRecord
    :return: 4-tuple of the three leading fields and ``len(r.sequence)``
    """
    # Everything after the first space (e.g. "[revcomp]") is ignored.
    read_id = r.name.partition(' ')[0]
    fields = read_id.split('/')
    return fields[0], fields[1], fields[2], len(r.sequence)
def to_pacbio_record_or_raise(r):
    """
    Convert a record to ``(name, sequence length)``, rejecting names with ':'.

    :type r: FastaRecord
    :return: tuple of ``r.name`` and ``len(r.sequence)``
    :raises ValueError: if ``r.name`` contains a ':' character
    """
    if ":" in r.name:
        raise ValueError("Malformed contig name '{n}'".format(n=r.name))
    return r.name, len(r.sequence)
def example_01(file_name):
    """
    Print ``(name, length)`` for every record, using a partial application.

    ``functools.partial`` binds the converter so that the resulting
    callable only needs the file name.

    :param file_name: FASTA file passed through to ``_to_records``
    """
    to_record = functools.partial(_to_records, to_my_record)
    for r in to_record(file_name):
        # Single-argument call form prints identically on Python 2 and 3.
        print(r)
def example_02(file_name):
    """
    Print the ``to_movie_id`` tuple for every record in the file.

    :param file_name: FASTA file passed through to ``_to_records``
    """
    for r in _to_records(to_movie_id, file_name):
        # Single-argument call form prints identically on Python 2 and 3.
        print(r)
def example_03(file_name):
    """
    Print ``(name, length)`` for every record, validating each name.

    The ``ValueError`` raised by ``to_pacbio_record_or_raise`` is not
    caught here, so it propagates to the caller.

    :param file_name: FASTA file passed through to ``_to_records``
    """
    for r in _to_records(to_pacbio_record_or_raise, file_name):
        # Single-argument call form prints identically on Python 2 and 3.
        print(r)
def example_04(file_name):
    """
    Run several converters over the same file, one full pass per converter.

    Each converter gets its own ``_to_records`` call; the results of every
    pass are printed in turn.

    :param file_name: FASTA file passed through to ``_to_records``
    """
    funcs = [to_my_record, to_movie_id, to_pacbio_record_or_raise]
    for func in funcs:
        for r in _to_records(func, file_name):
            # Single-argument call form prints identically on Python 2 and 3.
            print(r)
def example_05(file_name):
    """
    Collect every sequence length into a NumPy array, print it and return it.

    The generator is fed straight into ``np.fromiter``. ``count=-1`` tells
    NumPy to read the iterator until it is exhausted.

    :param file_name: FASTA file passed through to ``_to_records``
    :return: one-dimensional ``np.int64`` array of sequence lengths
    """
    contig_lengths = np.fromiter(
        _to_records(to_length, file_name), dtype=np.int64, count=-1
    )
    # Single-argument call form prints identically on Python 2 and 3.
    print(contig_lengths)
    return contig_lengths
def run_examples(file_name='small.rc.fasta'):
    """
    Run every example function, in order, against one FASTA file.

    :param file_name: FASTA file handed to each example; defaults to
        ``'small.rc.fasta'``, the file name that used to be hard-coded
    """
    examples = [example_01, example_02, example_03, example_04, example_05]
    for func in examples:
        func(file_name)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/9/0_1756 [revcomp] | |
TCGCCTTCCGTGTCGCTTACGCGAAATTGAGAAGAACTGAATGCAGTGATGCCGTACCGAACCGGCGTAC | |
GCTCGCATGAAGCAGGCATAAATATCCATCTGATGGAAATGCCGGGGCTGAACTCGGTATATCTCTGTAT | |
AACCGTGGGCGAAAACACCACTCGATGCACGTGAAAGTTCGCCAGGCTCTGACCTACGCGTTGAACAAAG | |
ACGGATCATCAAAGCCGGTTTTATCAGGCGGCGGCGTCATCGCGAAAAACTGATCCCGCCACCATGTGGG | |
GCTATCACGACGACGTTTCAGGACTACACCTCGATCCTGAAAAGCGGAAGCCTTGGCTGAAGAAGAGGGT | |
CTGGAAAAAGGTCTCCCATCGGACCTGTGGGCGATGCCGGTACAAATTCCGTTATAACCCGAACGGCTCG | |
GCCGCATGGCGGAGTTATGATTCAGGGCAGACCTGGCGAAAGGTCGGCGTGCAGGCAAATTGTCACCTAC | |
GATGGTGGTGAGTACCTCAAGCGTGCGAAAGATGGCGAGCACCAGACCGGTAATGATGGGCTGGACTGGC | |
GTACGGGGATCCGGTATCTTCGCCACCCTGTTCAGTCTGCGCCGCCTTGAACAAGGCTCCAAACTACTCA | |
AAATGGTGCTACAAAACGTTTGAAGATCTGATCTTCACCCGGCGCGTGCTACGACGACATCCAATAACGC | |
GTGACTGACAACAGCGCAGGTGGTTGATGCACGACATCAGGCTCCGGCACTGATTCATCGCTCACTCACC | |
GTGTTGGAACCGGTACGTAAAGAAGTTAAAGGCTATGTGGTTGATTCCATTAGGCAACATCACTTCGAAA | |
CGTCTCTATCGATAATTAAAAGGCCATACAAGACTGATGCAAAGGCAAAAATGCCTGGATGCGCTCCGCC | |
CTTATCAGGCCTACGAATTCTGCAATGTATAATTTGCACGATTTTGTAGGCCGGATAAGCGGTTAACGCC | |
GCCATCGGCTATACCAAAGCGCACTTGTCACAATCTTATACCGTGGCCGCTGTGCTCTGCGGCAGCGCCA | |
TCGGCAGCATACTTCTCCCTGTCGATAAGGGCGGGAATGATTTGTGGGCAATACGACACGCAGTTCCAGG | |
CTGCGGGTCACTACTAGAGAATCCCGGGTGGGGGGGTTATGTTTGCAGTTTATTTTCTCGACGTTTGCGG | |
ATTCGTCATCCCCACGTTAATCGGTATTCCCTTCTCACATTTGCCTTTGTCCACATGATCCGGGCGATCC | |
GGTGTATGATCATGGCGGGATGGGCGAACCGTGGGATTCTCCCAGAGCGTCAACGCGCAGCTGCTGGCTG | |
GAACTCGGCTTAGATAAAACCGATGTGGCACAGTTATCTCCATTACATTGGCGTTATGCATGGCGAATCT | |
AGGACATTCAATGAAAAGCGCCATCCCGGTTTGGGAAGAGTTCGTGCGCGCTTCCAGGGCCACGCTGGAA | |
CTGGCGTCTGCGCATGATTTTTGGCTACGGCAGTCGTATTCCGGTCGGCGTGGCTGGCGGCGGGTAAACG | |
CTGTTCCTTTCGATCCACCAGCGGTTGCTGGCGCTGACAGGTTATTCAATGCCTTATTTCGGTGGGCATG | |
ATGCTGATCATGCTGTCGGTGCACTGGAACCTGGACGCCCGTTCCGTGCTGGAGGATATCGGTTGTTCCT | |
CGATGACTCCAATCGTAACCGTGTTTATGCTATCGACACCGCCATCTGGGTGGAAGACGGAACCTTTCGA | |
TGGCCG | |
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/11/1513_4949 [revcomp] | |
TTTAATGTCAGCGGTATTGCTTTTGCAGGGCACTATAAAGCGTGACGTTTTGCCGCTGCGGCTAGGGCCA | |
GTTACCAGCACGTCCTGTGGTTGTTGCTCAAAGGCATTGAGAAAGTCGACCCAGTTGTAACGGTGGCATT | |
CCAGCGTGGTGCATCCAGTGCCTGACCACCTTGCTGTAACCACCTTAATACCCTTTTTTCACCCCGACAT | |
GGTAGGTCGCATACGAAACTGAGACGGCGTTTTCCTGCAGTTCGACAGTGAATTGCCCGCTCCTGCGGCC | |
GGCGATGTTCCGCAATATCAGGTTTCCCAGCACTTTTAATCTGGCGGTTTAATTGCGACTCCGCGCATCC | |
GGTGAACATCCGGTAAAAGGATGCAATACGCGCGTCGATACGCAGCGGGATGCGGTGGCATTGTCCGCTG | |
GGTTCGAGTTGATATCGAGCGCGGTTGTTTCGCGCAGATTGTAACGTTCGAGTTAGCAACTCTGCTTTGG | |
TGATGACTTCTCAACTGACAGCTACGGCGCAATGTTGTTGGTGTGCGGGCTGGCGTGACTCCATTTGTTG | |
GCGCCCGTCCAGCAGGTGATCATCATACGTTTGGTAGGTAGCGAAATAGCAAATGCGTCCAGTCGGCTCA | |
TGCGAATGGTGCAATCGACTACCGCCAACATGAACCACCTTCTTTCGCTGGCTCAGCAAGACTCCATGAT | |
AACGCAGACCACAGGCAGTGAGCCTGTGGAATATTCATTGCCGCTCCTTCGTGGGCGTCAATCAGCGGAA | |
GACATTTCGCAGGTTGCTGCAATGCAGCTGTCACTTTGGATTATTGCAGTTGCGCTGTTCCAGCCGGTGA | |
ACGCCGGTTTGCGTTATCCAACCCGGTGTCTGGACGACGCTTAGGCCATTGGCCTTTCCTTGCCGGTCAG | |
CGACACCACGCCCTTTGCCACACTCATGGCTGAACATGCGGGTGCGTGGTAGGCGAGGGAATGCCATTGC | |
TGCCACCGTCGCAGGTATCCTAATCATCCATGTTCCAGCGACGGCACAACTATCGGCGGTACGGTAAGCG | |
CACAAGGTTTGTAGCATGTCGGTGGTGCGGCTTTGCGCAGGTAGTTTTGATAAGCGGGTCCAATGGCGCT | |
TAAAATGGCATGAATGCAATAACCACTCAGTTCGATAGTGTAAAACCGCGTTGTTGGTCCATCTGTTCGC | |
TCCTTGATTTGCGTTGGCGCTACCTTTGGCAAACGCCCTCAGCACAGCGTAGGGGCAAAAACGAAACGGG | |
GAAAGCGATTCCCGAGGTTTTTATTTCGTTGCAGCGAAAGACAAGAAATTTGCGGGCGTTACGAAGAAAG | |
TTGGGGGAGGGGAGATTATCCGCCCGCGATCGAGCGGATAAATCTGTAATATTGCGAAAACGGCATTGAG | |
GTCGGAGTGCTTGTACGTGTTTAGTTAGCGCACGACGTCTTTGAGATAAAGTCCACGGCCCGTTCGCGCA | |
ATTCACGCAGTGTTTTGTCGTGGACGTTGCCGACATTCCGTAGCGCCTTGCCGTTGGTGGTTTTGACGGC | |
TTCGCGCATCTGTTCTGTTTCGGCTTATCCCAGCATGATGCTATCGGCTCCTGCTTTTTCAGGGCTTCAT | |
CAAGTTCTTCCAGATTCTCTCTTTCGATGTTCTGCGCATCCGGGTGCAGCCAGGCGCTTTTTTCGACCCG | |
CCTGGCGCCTGAGCCCGGAGGGCAAATAATATGGTTTTCTTTGTATCAGGAAGGCATCAGAAGCCCCGGA | |
CGTTGATTCGCTCCGCCGCCGCAAAGTCGCGTTTTCAGAAGCTGAACGCAGGCCGGGTAAGGTTTTGCGC | |
GTATCCAACAACTGGCGTGTTGGTGCCTTCCAGCATTCGACACTAGGTGGTACCTTACTGGCCAACTCCT | |
GAAAGGGTTTGCACAAATTAAGCGCATGCGTTCGCCCGTACGCACGCGGGATGGGGCCTTCAAGTTCGAA | |
CAAGGATGATTGGCATTGATTGCTCGCCGTTCACATGCCAGATTATGGGTGACATCGTCGCCTGCCAGTT | |
GAAAACACTCTTAACCGCGTTTGCGCAAAGACGCCATTCTCGCGCAAGGTGATAACCGTGGCATGAGAGC | |
GACTGAACTTTTCGGTAAGTTTTGCGTAATATCATTGTTGGCATCGACTGTTTCCGCCTAATCTTCCCGC | |
TCGCTTGGGCCACCGCGCCGGGATATCGAGCATTAAATTGCGTTCAAGCAGTCGTGCACGTCGGTGTCAG | |
GTATACGGCGGGAGCGGCATGTTAAACTCCAGACGTAGCTAACGAATCATAAGGTAGAACATGCTTACTC | |
TGAACCGGGTACTTGCACACAATTAAGGTCTGCATGTTGTTAGAACAGGGGTGGCTGGTTGGCGCGCGCG | |
CGCGTTTCCTACCACATTACGTTGGCGCCCCGCCCCCGATGCGGACACGCCCGGCTGTGAGGTGCACAAT | |
ATTAGCCTGCCGCAGGCGGTTTGGCGGTCCGTGGATCGGACGGCATTATTCACTGGAACTATTTATTCCG | |
CAGGCCATCTTTCTTTGCTGAGATCGCCATTTAGCGCGTCTCCGTCACTGTTTGATTCGCTGATGGTGAA | |
ATAGTCCAGTATGTTCCTTTCGTAAACGTTGCATGGCATGCGGGAGTCTCTCAGTATCAGGGGCGGCGAC | |
GCTGCAATGATTTTTCTATTGGGATTGAGCTTGAAGGCACGATACGCTGGCGTATTACGATGCGCGTATT | |
CAACAGTGGTGTCGTTGTGGCCATCCTGGGCGGATTGTTACCTGTGGTGTGCTCATACATATTTTTGTAA | |
TGTAGCTCTGCAGTGTGTCACGCGATATTCCGCTTGTCACCTGGAACGTGCCGGCGCATATTTCGTAGGA | |
TTGTAGCGTCGGCAGCCCATTCACGTCGGAAATTTTCCCGACATAGCATGCAGGAGGTCCATGCTATGTG | |
TAACTGTTATATGTGCTGGGGGCGCGAACGGTCGTGTCTGTGCTGTGTCTCGCTTGGCAATCGGATGTTA | |
CTAATGTCGACGCTAGTCCTCTCTCGGCAATACGACCTCGTATCACATGTGTGTATTCAGTTCTGGGGCG | |
TCTGTATTATAGTAGAGTGCTGTGAGCGATGTGGCAGCTATGGTATCATCTGTCACTGTTGTAAAGCTGT | |
CTGTGGTCTTCGGACGCAGCTCGTTTACGCCCTTCAGGTGTGCGGGCGATAAAACATTTTGATCCGTGTT | |
TTTTAAATACCAGCGTCAACTCGTTAGCATTAACCATCATCTGATTTTTTCTTTGGCTGGAATGGCCGGA | |
ACATTAATGTAGAACAGCGACTTTTTATTTTCAGGCAGGTTATCAGCTGTTTTATAATTCGCCAGGGATG | |
AGTCATCGCCAGCCTCAAGCGAAAATAGCGGTGGGGTTATAATAAATGCGGGCACGGCTTTTACCATCTA | |
TATTAT | |
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/13/0_68 [revcomp] | |
GCTGACTGAGGGTTGTGCACTTCACTTCGAACAGATGCTTGCCGTAGACTAGGCTCGCGTGTCGAGGC | |
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/13/117_519 [revcomp] | |
CATTTTTTTCTTTGGGTTGGTGGGAAAGCGTGTGGTTGGAGGACGGGCCGACATACTGGCCCGTTACTAT | |
GTCGACGCGGTGCGTGAATATTGTATGCTTGTGTGGCAACTAGCTGAGGGATTGAGTCATTCGCCTTGCG | |
CGTATTGCTTGTTGAAGTTGGCGGGGCCGGCCTGTTATTAGCGGGGGTCATCGCCACTGCGCGGCATGGT | |
GGAAGGGGCTTTGAGTAGGGTTTCGCCGAGCTCTTCAGCATCGCGTCTTTCGGTGAAAGCGGGCGCAATG | |
GCGACAGCTGTGTGTGTGATCCTACGCCGCGAATCGACGTGTCCGAAAATAGTAACTGTACGGCATGACT | |
TCGGCCGTGATGCTTCGGTGTTTAGAGCTTGTTGAAGTTGGGAGTCATGTCG |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment