Skip to content

Instantly share code, notes, and snippets.

@mpkocher
Last active October 2, 2019 15:30
Show Gist options
  • Save mpkocher/b117acefb807af10ba81 to your computer and use it in GitHub Desktop.
Save mpkocher/b117acefb807af10ba81 to your computer and use it in GitHub Desktop.
Example of using pbcore's FastaReader with generators in Python
import os
import functools
import numpy as np
from pbcore.io import FastaReader, FastaRecord
def _to_records(func, file_name):
    """
    Lazily yield ``func(record)`` for every record read from a FASTA file.

    The file is opened with ``FastaReader`` used as a context manager, and
    records are converted one at a time as the generator is consumed.

    :param func: callable applied to each record produced by the reader
    :param file_name: path handed to ``FastaReader``
    :return: generator of whatever ``func`` returns for each record
    """
    with FastaReader(file_name) as reader:
        for fasta_record in reader:
            converted = func(fasta_record)
            yield converted
def to_pbcore_record(r):
    """
    Identity converter: hand the record back untouched.

    :type r: FastaRecord
    :return: the very same object that was passed in
    """
    return r
def to_my_record(r):
    """
    Reduce a record to a ``(name, sequence length)`` tuple.

    :type r: FastaRecord
    :return: tuple of ``r.name`` and ``len(r.sequence)``
    """
    record_name = r.name
    sequence_length = len(r.sequence)
    return (record_name, sequence_length)
def to_length(r):
    """
    Reduce a record to the length of its sequence.

    :type r: FastaRecord
    :return: ``len(r.sequence)``
    """
    sequence = r.sequence
    return len(sequence)
def to_movie_id(r):
    """
    Break a '/'-delimited record header into its first three fields.

    The header is expected to look like:
    >m120201_042231_42129_c100275262550000001523007907041260_s1_p0/9/0_1756 [revcomp]

    Only the text before the first space is considered; it is split on '/'
    (presumably movie name / hole number / subread range -- confirm against
    the data producer).

    :type r: FastaRecord
    :return: tuple of the first three '/'-separated fields and
        ``len(r.sequence)``
    """
    # Everything after the first space (e.g. "[revcomp]") is discarded.
    identifier = r.name.partition(' ')[0]
    fields = identifier.split('/')
    return fields[0], fields[1], fields[2], len(r.sequence)
def to_pacbio_record_or_raise(r):
    """
    Convert a record to ``(name, sequence length)``, rejecting names with ':'.

    :type r: FastaRecord
    :return: tuple of ``r.name`` and ``len(r.sequence)``
    :raises ValueError: if ``r.name`` contains a ':' character
    """
    if ":" in r.name:
        raise ValueError("Malformed contig name '{n}'".format(n=r.name))
    return r.name, len(r.sequence)
def example_01(file_name):
    """
    Print the ``(name, length)`` tuple of every record in ``file_name``.

    Shows how the converter can be bound to ``_to_records`` up front with
    ``functools.partial`` so that only the file name remains to be supplied.

    :param file_name: path of the FASTA file to read
    """
    to_record = functools.partial(_to_records, to_my_record)
    for r in to_record(file_name):
        # Parenthesised single-argument print produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(r)
def example_02(file_name):
    """
    Print the ``to_movie_id`` tuple of every record in ``file_name``.

    :param file_name: path of the FASTA file to read
    """
    for r in _to_records(to_movie_id, file_name):
        # Parenthesised single-argument print produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(r)
def example_03(file_name):
    """
    Print ``(name, length)`` for every record, failing on names with ':'.

    :param file_name: path of the FASTA file to read
    :raises ValueError: propagated from ``to_pacbio_record_or_raise`` when a
        record name contains ':'
    """
    for r in _to_records(to_pacbio_record_or_raise, file_name):
        # Parenthesised single-argument print produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(r)
def example_04(file_name):
    """
    Run several converters over ``file_name`` in turn, printing every result.

    The file is re-read from the start once per converter, because each call
    to ``_to_records`` opens its own reader.

    :param file_name: path of the FASTA file to read
    """
    funcs = [to_my_record, to_movie_id, to_pacbio_record_or_raise]
    for func in funcs:
        for r in _to_records(func, file_name):
            # Parenthesised single-argument print produces the same output
            # under the Python 2 print statement and the Python 3 function.
            print(r)
def example_05(file_name):
    """
    Collect the sequence length of every record into a NumPy array.

    The lengths are streamed from the record generator straight into
    ``np.fromiter``; the resulting array is printed and returned.

    :param file_name: path of the FASTA file to read
    :return: ``np.int64`` array holding one length per record
    """
    # count=-1 tells fromiter to consume the generator until it is exhausted.
    contig_lengths = np.fromiter(_to_records(to_length, file_name), dtype=np.int64, count=-1)
    # Parenthesised single-argument print produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(contig_lengths)
    return contig_lengths
def run_examples(file_name='small.rc.fasta'):
    """
    Run every example function, in order, against one FASTA file.

    :param file_name: path of the FASTA file passed to each example;
        defaults to ``'small.rc.fasta'`` so existing zero-argument calls
        behave as before
    """
    examples = [example_01, example_02, example_03, example_04, example_05]
    for func in examples:
        func(file_name)
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/9/0_1756 [revcomp]
TCGCCTTCCGTGTCGCTTACGCGAAATTGAGAAGAACTGAATGCAGTGATGCCGTACCGAACCGGCGTAC
GCTCGCATGAAGCAGGCATAAATATCCATCTGATGGAAATGCCGGGGCTGAACTCGGTATATCTCTGTAT
AACCGTGGGCGAAAACACCACTCGATGCACGTGAAAGTTCGCCAGGCTCTGACCTACGCGTTGAACAAAG
ACGGATCATCAAAGCCGGTTTTATCAGGCGGCGGCGTCATCGCGAAAAACTGATCCCGCCACCATGTGGG
GCTATCACGACGACGTTTCAGGACTACACCTCGATCCTGAAAAGCGGAAGCCTTGGCTGAAGAAGAGGGT
CTGGAAAAAGGTCTCCCATCGGACCTGTGGGCGATGCCGGTACAAATTCCGTTATAACCCGAACGGCTCG
GCCGCATGGCGGAGTTATGATTCAGGGCAGACCTGGCGAAAGGTCGGCGTGCAGGCAAATTGTCACCTAC
GATGGTGGTGAGTACCTCAAGCGTGCGAAAGATGGCGAGCACCAGACCGGTAATGATGGGCTGGACTGGC
GTACGGGGATCCGGTATCTTCGCCACCCTGTTCAGTCTGCGCCGCCTTGAACAAGGCTCCAAACTACTCA
AAATGGTGCTACAAAACGTTTGAAGATCTGATCTTCACCCGGCGCGTGCTACGACGACATCCAATAACGC
GTGACTGACAACAGCGCAGGTGGTTGATGCACGACATCAGGCTCCGGCACTGATTCATCGCTCACTCACC
GTGTTGGAACCGGTACGTAAAGAAGTTAAAGGCTATGTGGTTGATTCCATTAGGCAACATCACTTCGAAA
CGTCTCTATCGATAATTAAAAGGCCATACAAGACTGATGCAAAGGCAAAAATGCCTGGATGCGCTCCGCC
CTTATCAGGCCTACGAATTCTGCAATGTATAATTTGCACGATTTTGTAGGCCGGATAAGCGGTTAACGCC
GCCATCGGCTATACCAAAGCGCACTTGTCACAATCTTATACCGTGGCCGCTGTGCTCTGCGGCAGCGCCA
TCGGCAGCATACTTCTCCCTGTCGATAAGGGCGGGAATGATTTGTGGGCAATACGACACGCAGTTCCAGG
CTGCGGGTCACTACTAGAGAATCCCGGGTGGGGGGGTTATGTTTGCAGTTTATTTTCTCGACGTTTGCGG
ATTCGTCATCCCCACGTTAATCGGTATTCCCTTCTCACATTTGCCTTTGTCCACATGATCCGGGCGATCC
GGTGTATGATCATGGCGGGATGGGCGAACCGTGGGATTCTCCCAGAGCGTCAACGCGCAGCTGCTGGCTG
GAACTCGGCTTAGATAAAACCGATGTGGCACAGTTATCTCCATTACATTGGCGTTATGCATGGCGAATCT
AGGACATTCAATGAAAAGCGCCATCCCGGTTTGGGAAGAGTTCGTGCGCGCTTCCAGGGCCACGCTGGAA
CTGGCGTCTGCGCATGATTTTTGGCTACGGCAGTCGTATTCCGGTCGGCGTGGCTGGCGGCGGGTAAACG
CTGTTCCTTTCGATCCACCAGCGGTTGCTGGCGCTGACAGGTTATTCAATGCCTTATTTCGGTGGGCATG
ATGCTGATCATGCTGTCGGTGCACTGGAACCTGGACGCCCGTTCCGTGCTGGAGGATATCGGTTGTTCCT
CGATGACTCCAATCGTAACCGTGTTTATGCTATCGACACCGCCATCTGGGTGGAAGACGGAACCTTTCGA
TGGCCG
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/11/1513_4949 [revcomp]
TTTAATGTCAGCGGTATTGCTTTTGCAGGGCACTATAAAGCGTGACGTTTTGCCGCTGCGGCTAGGGCCA
GTTACCAGCACGTCCTGTGGTTGTTGCTCAAAGGCATTGAGAAAGTCGACCCAGTTGTAACGGTGGCATT
CCAGCGTGGTGCATCCAGTGCCTGACCACCTTGCTGTAACCACCTTAATACCCTTTTTTCACCCCGACAT
GGTAGGTCGCATACGAAACTGAGACGGCGTTTTCCTGCAGTTCGACAGTGAATTGCCCGCTCCTGCGGCC
GGCGATGTTCCGCAATATCAGGTTTCCCAGCACTTTTAATCTGGCGGTTTAATTGCGACTCCGCGCATCC
GGTGAACATCCGGTAAAAGGATGCAATACGCGCGTCGATACGCAGCGGGATGCGGTGGCATTGTCCGCTG
GGTTCGAGTTGATATCGAGCGCGGTTGTTTCGCGCAGATTGTAACGTTCGAGTTAGCAACTCTGCTTTGG
TGATGACTTCTCAACTGACAGCTACGGCGCAATGTTGTTGGTGTGCGGGCTGGCGTGACTCCATTTGTTG
GCGCCCGTCCAGCAGGTGATCATCATACGTTTGGTAGGTAGCGAAATAGCAAATGCGTCCAGTCGGCTCA
TGCGAATGGTGCAATCGACTACCGCCAACATGAACCACCTTCTTTCGCTGGCTCAGCAAGACTCCATGAT
AACGCAGACCACAGGCAGTGAGCCTGTGGAATATTCATTGCCGCTCCTTCGTGGGCGTCAATCAGCGGAA
GACATTTCGCAGGTTGCTGCAATGCAGCTGTCACTTTGGATTATTGCAGTTGCGCTGTTCCAGCCGGTGA
ACGCCGGTTTGCGTTATCCAACCCGGTGTCTGGACGACGCTTAGGCCATTGGCCTTTCCTTGCCGGTCAG
CGACACCACGCCCTTTGCCACACTCATGGCTGAACATGCGGGTGCGTGGTAGGCGAGGGAATGCCATTGC
TGCCACCGTCGCAGGTATCCTAATCATCCATGTTCCAGCGACGGCACAACTATCGGCGGTACGGTAAGCG
CACAAGGTTTGTAGCATGTCGGTGGTGCGGCTTTGCGCAGGTAGTTTTGATAAGCGGGTCCAATGGCGCT
TAAAATGGCATGAATGCAATAACCACTCAGTTCGATAGTGTAAAACCGCGTTGTTGGTCCATCTGTTCGC
TCCTTGATTTGCGTTGGCGCTACCTTTGGCAAACGCCCTCAGCACAGCGTAGGGGCAAAAACGAAACGGG
GAAAGCGATTCCCGAGGTTTTTATTTCGTTGCAGCGAAAGACAAGAAATTTGCGGGCGTTACGAAGAAAG
TTGGGGGAGGGGAGATTATCCGCCCGCGATCGAGCGGATAAATCTGTAATATTGCGAAAACGGCATTGAG
GTCGGAGTGCTTGTACGTGTTTAGTTAGCGCACGACGTCTTTGAGATAAAGTCCACGGCCCGTTCGCGCA
ATTCACGCAGTGTTTTGTCGTGGACGTTGCCGACATTCCGTAGCGCCTTGCCGTTGGTGGTTTTGACGGC
TTCGCGCATCTGTTCTGTTTCGGCTTATCCCAGCATGATGCTATCGGCTCCTGCTTTTTCAGGGCTTCAT
CAAGTTCTTCCAGATTCTCTCTTTCGATGTTCTGCGCATCCGGGTGCAGCCAGGCGCTTTTTTCGACCCG
CCTGGCGCCTGAGCCCGGAGGGCAAATAATATGGTTTTCTTTGTATCAGGAAGGCATCAGAAGCCCCGGA
CGTTGATTCGCTCCGCCGCCGCAAAGTCGCGTTTTCAGAAGCTGAACGCAGGCCGGGTAAGGTTTTGCGC
GTATCCAACAACTGGCGTGTTGGTGCCTTCCAGCATTCGACACTAGGTGGTACCTTACTGGCCAACTCCT
GAAAGGGTTTGCACAAATTAAGCGCATGCGTTCGCCCGTACGCACGCGGGATGGGGCCTTCAAGTTCGAA
CAAGGATGATTGGCATTGATTGCTCGCCGTTCACATGCCAGATTATGGGTGACATCGTCGCCTGCCAGTT
GAAAACACTCTTAACCGCGTTTGCGCAAAGACGCCATTCTCGCGCAAGGTGATAACCGTGGCATGAGAGC
GACTGAACTTTTCGGTAAGTTTTGCGTAATATCATTGTTGGCATCGACTGTTTCCGCCTAATCTTCCCGC
TCGCTTGGGCCACCGCGCCGGGATATCGAGCATTAAATTGCGTTCAAGCAGTCGTGCACGTCGGTGTCAG
GTATACGGCGGGAGCGGCATGTTAAACTCCAGACGTAGCTAACGAATCATAAGGTAGAACATGCTTACTC
TGAACCGGGTACTTGCACACAATTAAGGTCTGCATGTTGTTAGAACAGGGGTGGCTGGTTGGCGCGCGCG
CGCGTTTCCTACCACATTACGTTGGCGCCCCGCCCCCGATGCGGACACGCCCGGCTGTGAGGTGCACAAT
ATTAGCCTGCCGCAGGCGGTTTGGCGGTCCGTGGATCGGACGGCATTATTCACTGGAACTATTTATTCCG
CAGGCCATCTTTCTTTGCTGAGATCGCCATTTAGCGCGTCTCCGTCACTGTTTGATTCGCTGATGGTGAA
ATAGTCCAGTATGTTCCTTTCGTAAACGTTGCATGGCATGCGGGAGTCTCTCAGTATCAGGGGCGGCGAC
GCTGCAATGATTTTTCTATTGGGATTGAGCTTGAAGGCACGATACGCTGGCGTATTACGATGCGCGTATT
CAACAGTGGTGTCGTTGTGGCCATCCTGGGCGGATTGTTACCTGTGGTGTGCTCATACATATTTTTGTAA
TGTAGCTCTGCAGTGTGTCACGCGATATTCCGCTTGTCACCTGGAACGTGCCGGCGCATATTTCGTAGGA
TTGTAGCGTCGGCAGCCCATTCACGTCGGAAATTTTCCCGACATAGCATGCAGGAGGTCCATGCTATGTG
TAACTGTTATATGTGCTGGGGGCGCGAACGGTCGTGTCTGTGCTGTGTCTCGCTTGGCAATCGGATGTTA
CTAATGTCGACGCTAGTCCTCTCTCGGCAATACGACCTCGTATCACATGTGTGTATTCAGTTCTGGGGCG
TCTGTATTATAGTAGAGTGCTGTGAGCGATGTGGCAGCTATGGTATCATCTGTCACTGTTGTAAAGCTGT
CTGTGGTCTTCGGACGCAGCTCGTTTACGCCCTTCAGGTGTGCGGGCGATAAAACATTTTGATCCGTGTT
TTTTAAATACCAGCGTCAACTCGTTAGCATTAACCATCATCTGATTTTTTCTTTGGCTGGAATGGCCGGA
ACATTAATGTAGAACAGCGACTTTTTATTTTCAGGCAGGTTATCAGCTGTTTTATAATTCGCCAGGGATG
AGTCATCGCCAGCCTCAAGCGAAAATAGCGGTGGGGTTATAATAAATGCGGGCACGGCTTTTACCATCTA
TATTAT
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/13/0_68 [revcomp]
GCTGACTGAGGGTTGTGCACTTCACTTCGAACAGATGCTTGCCGTAGACTAGGCTCGCGTGTCGAGGC
>m120201_042231_42129_c100275262550000001523007907041260_s1_p0/13/117_519 [revcomp]
CATTTTTTTCTTTGGGTTGGTGGGAAAGCGTGTGGTTGGAGGACGGGCCGACATACTGGCCCGTTACTAT
GTCGACGCGGTGCGTGAATATTGTATGCTTGTGTGGCAACTAGCTGAGGGATTGAGTCATTCGCCTTGCG
CGTATTGCTTGTTGAAGTTGGCGGGGCCGGCCTGTTATTAGCGGGGGTCATCGCCACTGCGCGGCATGGT
GGAAGGGGCTTTGAGTAGGGTTTCGCCGAGCTCTTCAGCATCGCGTCTTTCGGTGAAAGCGGGCGCAATG
GCGACAGCTGTGTGTGTGATCCTACGCCGCGAATCGACGTGTCCGAAAATAGTAACTGTACGGCATGACT
TCGGCCGTGATGCTTCGGTGTTTAGAGCTTGTTGAAGTTGGGAGTCATGTCG
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment