Created
April 29, 2015 19:58
-
-
Save ebolyen/7e7286e704b1934191d0 to your computer and use it in GitHub Desktop.
skbio - fastq reading-with-vectorized
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from itertools import islice\n", | |
"from skbio import io, DNASequence, SequenceCollection\n", | |
"\n", | |
"fastq_path = './Undetermined_S0_L001_R2_001.fastq'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Pure skbio" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 48.2 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"for read in islice(io.read(fastq_path, format='fastq', verify=False, \n", | |
" variant='illumina1.8', constructor=DNASequence), 0, 1000):\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Simple reader" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from itertools import chain, repeat\n", | |
"\n", | |
"def grouper(n, iterable, pad_value=None):\n", | |
"    \"\"\"Group iterable into chunks of n.\n", | |
"\n", | |
"    The last chunk is filled up with pad_value when iterable runs out early:\n", | |
"\n", | |
"    grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')\n", | |
"\n", | |
"    :param n: number of items per chunk\n", | |
"    :param iterable: source of the items to group\n", | |
"    :param pad_value: filler used to complete the last chunk\n", | |
"    :return: the zip of n references to one padded iterator, i.e. tuples of length n\n", | |
"    \"\"\"\n", | |
"    return zip(*[chain(iterable, repeat(pad_value, n-1))]*n)\n", | |
"\n", | |
"def read_fastq(file_path):\n", | |
"    \"\"\"Yield one FastqSequence per four-line record in file_path.\n", | |
"\n", | |
"    Lines 1, 2 and 4 of each record become id, sequence and quality\n", | |
"    (stripped of surrounding whitespace); line 3 is ignored.\n", | |
"\n", | |
"    :param file_path: path of the FASTQ file to read\n", | |
"    :return: generator of FastqSequence objects\n", | |
"    :raises ValueError: if the file ends in the middle of a record\n", | |
"    \"\"\"\n", | |
"    with open(file_path, 'r') as file_:\n", | |
"        for record in grouper(4, file_):\n", | |
"            if record[-1] is None:\n", | |
"                # grouper padded the final chunk with None, so fewer than four\n", | |
"                # lines were left. Blank leftovers are ignored; anything else is\n", | |
"                # a cut-off record (calling .strip() on the padding would fail\n", | |
"                # with an unrelated AttributeError).\n", | |
"                if any(line.strip() for line in record if line is not None):\n", | |
"                    raise ValueError('truncated FASTQ record at end of %r'\n", | |
"                                     % (file_path,))\n", | |
"                return\n", | |
"            id_, seq, _, qual = record\n", | |
"            yield FastqSequence(id=id_.strip(),\n", | |
"                                sequence=seq.strip(),\n", | |
"                                quality=qual.strip())\n", | |
"\n", | |
"class FastqSequence(object):\n", | |
"    \"\"\"Plain container for the id, sequence and quality of one read.\"\"\"\n", | |
"\n", | |
"    def __init__(self, sequence, id, quality):\n", | |
"        self.sequence = sequence\n", | |
"        self.id = id\n", | |
"        self.quality = quality" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 2.08 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"for read in islice(read_fastq(fastq_path), 0, 1000):\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Cross-check: both readers must report the same bases for each of the\n", | |
"# first 1000 reads. One assertion replaces 1000 printed booleans that had\n", | |
"# to be scanned by eye.\n", | |
"simple_reads = islice(read_fastq(fastq_path), 0, 1000)\n", | |
"skbio_reads = islice(io.read(fastq_path, format='fastq', verify=False,\n", | |
"                             variant='illumina1.8', constructor=DNASequence), 0, 1000)\n", | |
"mismatches = [index for index, (simple, skb) in enumerate(zip(simple_reads, skbio_reads))\n", | |
"              if simple.sequence != str(skb)]\n", | |
"assert not mismatches, 'sequence mismatch at read indexes %s' % mismatches[:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## With conversion" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from skbio.io._base import _decode_qual_to_phred\n", | |
"\n", | |
"def read_fastq_convert(file_path):\n", | |
"    \"\"\"Yield one FastqSequence per four-line record, with decoded quality.\n", | |
"\n", | |
"    Lines 1 and 2 of each record become id and sequence (stripped); line 3\n", | |
"    is ignored; line 4 is stripped and passed through skbio's private\n", | |
"    _decode_qual_to_phred helper with variant='illumina1.8', and whatever\n", | |
"    that helper returns is stored as quality.\n", | |
"\n", | |
"    :param file_path: path of the FASTQ file to read\n", | |
"    :return: generator of FastqSequence objects\n", | |
"    :raises ValueError: if the file ends in the middle of a record\n", | |
"    \"\"\"\n", | |
"    with open(file_path, 'r') as file_:\n", | |
"        for record in grouper(4, file_):\n", | |
"            if record[-1] is None:\n", | |
"                # grouper pads a short final chunk with None; calling .strip()\n", | |
"                # on that padding would fail with an AttributeError. Ignore\n", | |
"                # blank leftovers and report a cut-off record explicitly.\n", | |
"                if any(line.strip() for line in record if line is not None):\n", | |
"                    raise ValueError('truncated FASTQ record at end of %r'\n", | |
"                                     % (file_path,))\n", | |
"                return\n", | |
"            id_, seq, _, qual = record\n", | |
"            quality = _decode_qual_to_phred(qual.strip(), variant='illumina1.8')\n", | |
"            yield FastqSequence(id=id_.strip(),\n", | |
"                                sequence=seq.strip(),\n", | |
"                                quality=quality)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"100 loops, best of 3: 15.7 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"for read in islice(read_fastq_convert(fastq_path), 0, 1000):\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## With DNASequence" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def read_fastq_convert_dna(file_path):\n", | |
"    \"\"\"Yield one skbio DNASequence per four-line record in file_path.\n", | |
"\n", | |
"    Lines 1 and 2 of each record become id and sequence (stripped); line 3\n", | |
"    is ignored; line 4 is stripped and passed through skbio's private\n", | |
"    _decode_qual_to_phred helper with variant='illumina1.8', and the result\n", | |
"    is handed to DNASequence as quality.\n", | |
"\n", | |
"    :param file_path: path of the FASTQ file to read\n", | |
"    :return: generator of DNASequence objects\n", | |
"    :raises ValueError: if the file ends in the middle of a record\n", | |
"    \"\"\"\n", | |
"    with open(file_path, 'r') as file_:\n", | |
"        for record in grouper(4, file_):\n", | |
"            if record[-1] is None:\n", | |
"                # grouper pads a short final chunk with None; calling .strip()\n", | |
"                # on that padding would fail with an AttributeError. Ignore\n", | |
"                # blank leftovers and report a cut-off record explicitly.\n", | |
"                if any(line.strip() for line in record if line is not None):\n", | |
"                    raise ValueError('truncated FASTQ record at end of %r'\n", | |
"                                     % (file_path,))\n", | |
"                return\n", | |
"            id_, seq, _, qual = record\n", | |
"            quality = _decode_qual_to_phred(qual.strip(), variant='illumina1.8')\n", | |
"            yield DNASequence(id=id_.strip(),\n", | |
"                              sequence=seq.strip(),\n", | |
"                              quality=quality)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"10 loops, best of 3: 26.5 ms per loop\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"for read in islice(read_fastq_convert_dna(fastq_path), 0, 1000):\n", | |
" pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.4.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment