USAGE: extract_fastq_barcodes_from_header.py input_reads.fastq barcode_reads.fastq
Created
November 17, 2012 03:28
-
-
Save gregcaporaso/4093047 to your computer and use it in GitHub Desktop.
Quick-and-dirty script that creates a barcode-read FASTQ file from a sequence-read FASTQ file whose barcodes are stored in the read headers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# File created on 16 Nov 2012 | |
from __future__ import division | |
__author__ = "Greg Caporaso" | |
__credits__ = ["Greg Caporaso"] | |
__license__ = "GPL" | |
__version__ = "1.5.0-dev" | |
__maintainer__ = "Greg Caporaso" | |
__email__ = "[email protected]" | |
__status__ = "Development" | |
from sys import argv | |
from unittest import TestCase, main as test_main | |
def script_main(input_f):
    """Yield one barcode FASTQ record per read found in ``input_f``.

    ``input_f`` is any iterable of FASTQ lines (an open file or a list of
    strings, with or without trailing newlines). For every read, the
    barcode is taken to be the text after the last ':' in the header line.

    Each yielded value is a four-line record *without* a trailing newline:
    the original header, the barcode, a '+' separator, and a placeholder
    quality string of 'b' characters with the same length as the barcode.

    The sequence and quality lines of every read are consumed explicitly,
    so a quality line that happens to begin with '@' is never mistaken for
    a header.
    """
    lines = iter(input_f)
    for line in lines:
        # Anything that is not a header at this point (blank lines,
        # trailing junk) is ignored.
        if not line.startswith('@'):
            continue

        header = line.strip()
        barcode = header.split(':')[-1]
        # The quality string must be as long as the barcode for the record
        # to be valid FASTQ; for 12 nt barcodes this is 'bbbbbbbbbbbb'.
        yield '%s\n%s\n+\n%s' % (header, barcode, 'b' * len(barcode))

        # Skip the sequence line(s) up to the '+' separator, remembering
        # how many bases were seen.
        seq_len = 0
        for line in lines:
            if line.startswith('+'):
                break
            seq_len += len(line.strip())

        # Skip quality line(s) until they cover the whole sequence; this
        # is what keeps '@'-leading quality lines away from the header
        # test above.
        qual_len = 0
        while qual_len < seq_len:
            line = next(lines, None)
            if line is None:
                # Truncated final record: nothing more to read.
                break
            qual_len += len(line.strip())
class ScriptTests(TestCase):
    """Checks script_main against the module-level FASTQ fixtures."""

    def setUp(self):
        # script_main iterates over lines, so hand it the fixture split on
        # newlines, the same way it would see the lines of an open file.
        self.fake_file = fake_file.split('\n')
        self.expected_output = expected_output

    def test_main(self):
        """ expected barcode fastq is generated """
        # Assemble the text exactly as the command-line entry point writes
        # it: every record followed by a newline. A plain '\n'.join() drops
        # the final newline that expected_output ends with.
        actual = ''.join(rec + '\n' for rec in script_main(self.fake_file))
        self.assertEqual(actual, self.expected_output)
# Test fixture: four reads in four-line FASTQ layout. The barcode of each
# read is the last ':'-separated field of its header line.
fake_file = """@M141:79:749142:1:1101:16224:1417 1:N:0:CGACTAATGTGT
TACGTAGGTGGCAAGCGTTAGCCGGAATTATTGGGCGTAAAGCGCGAGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGACACTGTAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGCAGAGATATGGAGGAACACCAGTAGCTAAGTCTACTTTCTGGACTGTAACTGACGCTGAGCTGCGACAGCGTGGGGATCAAACA
+
=9<==>9+--,55<@@EEEC+8AC+CCE-AAA.ACCCCCDAFFEAC>C555--*55<+55C+DDEDE3=C==4444+44@D@A33<@D@DE)@0@############################################################################################################################################################
@M141:79:749142:1:1101:16633:1423 1:N:0:CGACTTATGTGA
TACGTAGGTGGCAAGCGTTATCCGGAAGTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCTGTGACATGCGCAGAGATATGGAGACACACCAGTGGCGAAGGCGACGTTCTGGTCTGTAACTGACGCTGATGTGCACAAGCGTGGGGATCACACA
+
,5<==>>+<@@<@<@@E6+>+AACA=C+8ADAF=EE7>CEEF@ECCD>5>CEEDACCD5<CE@EDEEEEEDE@@@:+4@DDD==@@:2;@98@8::2296<E(;;6<EE<E;(66;(/;?66;<;<<E;?/.///96<EEE(6;?<?=E######################################################################################################
@M141:79:749142:1:1101:15549:1428 1:N:0:CGACTTATGTGT
TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTGTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTTGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTAGCGGTGAAATGCGCAGAGATATGGAGGAACATCAGTGGCGAAGGCGACTTGCTCGTCTGTAACTGACGCTGATGTGCGAAAGCGTGGTGATCCATCA
+
5<???BB?B?<5?B,<CC6CC8ECCEAFBEAA09A7>C>CCFG:D7>C>>>E=CD<<5+4CFFFFDFCDFFFF?@D,4DFD@@>>@DDEEE<A;<***1:;B;;?ACAEA?*0:*0?CC//::0A:AACA*008?EE8::::??/:0:A??AAC?################################################################################################
@M141:79:749142:1:1101:16736:1437 1:N:0:CGACTTATGTAT
TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGAAAACTTGAGTGCAGAAGAGGAAAGTGGAATTCCATGTGTCGCGGTGAAATGCGCAGAGATATGGAGGCACACCAGTGGCGAAGGCGACCTACTGGTCTGTTACTGACGCTGATGTGCGCAAGCTAGAGGATCAAACA
+
<???,5?9BB9<?-ABC@>C;CFA=CEHHHFFFGHF+>CEHHHHHHHHHHAEHH7CCD+DCCCFHDDBD@444B,??DFFFEEEEEEE,;C@B;:B**28AEEE*C:*0:?:*0:AECAE*0??E*00*:?**0/*0**0?EEECEEC:A:*/*08'2?')00??EAA?820:A#############################################################################
"""

# Barcode FASTQ expected for fake_file: per read, the original header, the
# barcode as the sequence, a '+' separator and a placeholder quality line.
expected_output = """@M141:79:749142:1:1101:16224:1417 1:N:0:CGACTAATGTGT
CGACTAATGTGT
+
bbbbbbbbbbbb
@M141:79:749142:1:1101:16633:1423 1:N:0:CGACTTATGTGA
CGACTTATGTGA
+
bbbbbbbbbbbb
@M141:79:749142:1:1101:15549:1428 1:N:0:CGACTTATGTGT
CGACTTATGTGT
+
bbbbbbbbbbbb
@M141:79:749142:1:1101:16736:1437 1:N:0:CGACTTATGTAT
CGACTTATGTAT
+
bbbbbbbbbbbb
"""
if __name__ == "__main__":
    # Command-line entry point.
    #   no arguments  -> print the usage line, then run the unit tests
    #   two arguments -> read argv[1] and write the barcode FASTQ to argv[2]
    #   anything else -> print the usage line on stderr and exit non-zero
    usage = "USAGE: extract_fastq_barcodes_from_header.py input_reads.fastq barcode_reads.fastq"
    if len(argv) == 1:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(usage)
        print("\nTest output:\n")
        test_main()
    elif len(argv) != 3:
        # Previously a lone argument fell through to argv[2] and died with
        # an IndexError; report the expected invocation instead.
        raise SystemExit(usage)
    else:
        from sys import version_info

        # 'U' asks Python 2 for universal newlines; Python 3 text mode does
        # that by default and rejects the 'U' flag from 3.11 onwards.
        read_mode = 'U' if version_info[0] < 3 else 'r'

        # The input is opened first so an unreadable input path does not
        # leave behind an empty (or truncated) output file, and both
        # handles are closed even if processing fails part-way.
        with open(argv[1], read_mode) as input_f, open(argv[2], 'w') as output_f:
            for rec in script_main(input_f):
                # Records come back without a trailing newline.
                output_f.write(rec + '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment