Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Last active December 19, 2015 05:18
Show Gist options
  • Save gregcaporaso/5902838 to your computer and use it in GitHub Desktop.
Save gregcaporaso/5902838 to your computer and use it in GitHub Desktop.
Code for demultiplexing fastq data where index reads and barcodes are included in the beginning of sequences. This code depends on QIIME 1.7.0.

Code for demultiplexing fastq data where index reads and barcodes are included in the beginning of sequences. This code depends on QIIME 1.7.0.

To run this code and pass the results into split_libraries_fastq.py:

prep_sl_fastq.py -b AmpF_25k.fastq.gz -m mapping.txt -o prepped_fastq
cd prepped_fastq
split_libraries_fastq.py -i AmpF_25k.fastq.amplicon.fastq -b AmpF_25k.fastq.barcode.fastq -m ../mapping.txt -o slout/ --barcode_type 12

Limitations of this code:

  • There is no error-correction of barcodes. Only perfect barcode reads will be included in the results.
#!/usr/bin/env python
# File created on 01 Jul 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2013, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "0.0.0"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from os.path import join, split, splitext
from qiime.util import (parse_command_line_parameters,
make_option)
from cogent import DNA
from cogent.parse.fastq import MinimalFastqParser
from qiime.util import qiime_open, create_dir
from qiime.parse import parse_mapping_file_to_dict
script_info = {}
script_info['brief_description'] = "Prepare fastq reads to be run through split_libraries_fastq.py."
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("","",""))
script_info['output_description']= ""
script_info['required_options'] = [
make_option('-b','--barcoded_read_fp',
type="existing_filepath",
help='path to fastq file containing the non-barcoded reads'),
make_option('-m','--mapping_fp',
type="existing_filepath",
help='path to metadata mapping file'),
make_option('-o','--output_dir',
type="new_dirpath",
help='the output directory'),
]
script_info['optional_options'] = [
]
script_info['version'] = __version__
def compute_barcode_length(barcode_map):
barcode_lengths = [len(bc) for bc in barcode_map]
barcode_length = barcode_lengths[0]
assert len(set(barcode_lengths)) == 1,\
"All barcodes must be the same length."
return barcode_length
def strip_barcode_and_linker_primer(fastq,
barcode_map):
"""
strip barcodes & linker-primer seqs; return barcode & amplicon seqs & quals
fastq: open file handle contained barcoded fastq data
barcode_map: dict mapping barcodes to the length of the associated
linker+primer sequences
yields header, barcode_read, barcode_qual, amplicon_read, amplicon_qual
"""
barcode_length = compute_barcode_length(barcode_map)
for header, seq, qual in MinimalFastqParser(fastq,strict=False):
barcode_read = seq[:barcode_length]
barcode_qual = qual[:barcode_length]
try:
linker_primer_length = barcode_map[barcode_read]
except KeyError:
continue
barcode_linker_primer_length = barcode_length + linker_primer_length
amplicon_read = seq[barcode_linker_primer_length:]
amplicon_qual = qual[barcode_linker_primer_length:]
yield header, barcode_read, barcode_qual, amplicon_read, amplicon_qual
def main():
option_parser, opts, args =\
parse_command_line_parameters(**script_info)
fastq = qiime_open(opts.barcoded_read_fp)
mapping_data, comments = parse_mapping_file_to_dict(qiime_open(opts.mapping_fp))
barcode_map = {}
for k, v in mapping_data.items():
barcode_map[v['BarcodeSequence']] = len(v['LinkerPrimerSequence'])
input_basename = split(splitext(opts.barcoded_read_fp)[0])[1]
output_amplicon_fn = '%s.amplicon.fastq' % input_basename
output_barcode_fn = '%s.barcode.fastq' % input_basename
output_amplicon_fp = join(opts.output_dir,output_amplicon_fn)
output_barcode_fp = join(opts.output_dir,output_barcode_fn)
create_dir(opts.output_dir)
output_amplicon_f = open(output_amplicon_fp,'w')
output_barcode_f = open(output_barcode_fp,'w')
for header, barcode_read, barcode_qual, amplicon_read, amplicon_qual in \
strip_barcode_and_linker_primer(fastq,barcode_map):
output_barcode_f.write(
'@%s\n%s\n+\n%s\n' % (header, barcode_read, barcode_qual))
output_amplicon_f.write(
'@%s\n%s\n+\n%s\n' % (header, amplicon_read, amplicon_qual))
if __name__ == "__main__":
main()
#!/usr/bin/env python
# File created on 01 Jul 2013
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.7.0-dev"
__maintainer__ = "Greg Caporaso"
__email__ = "[email protected]"
__status__ = "Development"
from shutil import rmtree
from os.path import exists, join
from cogent.util.unit_test import TestCase, main
from cogent.util.misc import remove_files, create_dir
from qiime.util import (get_qiime_temp_dir,
get_tmp_filename)
from qiime.test import initiate_timeout, disable_timeout
from prep_sl_fastq import (compute_barcode_length,
strip_barcode_and_linker_primer)
class PrepSlFastqTests(TestCase):
def setUp(self):
self.barcode_map1 = {'ACCT':3,
'ACCG':4,
'GCCG':5,
'GCCT':3}
self.barcode_map2 = {'GGGGGGGGACCT':3,
'GGGGGGGGACCG':4,
'GGGGGGGGGCCG':5,
'GGGGGGGGGCCG':3}
self.fastq1 = fastq1.split('\n')
self.fastq1_expected_output = fastq1_expected_output
def test_compute_barcode_length(self):
"""compute_barcode_length functions as expected
"""
self.assertEqual(compute_barcode_length(self.barcode_map1),4)
self.assertEqual(compute_barcode_length(self.barcode_map2),12)
self.assertRaises(AssertionError,compute_barcode_length,{'A':5,'AA':6})
def test_strip_barcode_and_linker_primer(self):
"""strip_barcode_and_linker_primer functions as expected"""
self.assertEqual(list(strip_barcode_and_linker_primer(self.fastq1,self.barcode_map1)),
self.fastq1_expected_output)
fastq1 = """@ACB045:20:000000000-A3T9K:1:1101:16852:1690 1:N:0:CGATGT
ACCTGGGTACGGGGGTATCTAATCCCGTTTGCTACTCAAGCTCTCGTCCATCACCGTCAGAAATATCATAGTTAGCTGCTTTCGCTTTCGGCGTTCTTTCCGATATCAACGCATTTCACCGCTCCACCGGAAATTCCACTAACCTCCAATATTCTCTAGATATAAAGTATCTATTCCCTTTCCACGATTGAGTCGTGGTATTTGAAAACAGACTTTTATATCGGGCTACGGACACTTTACGCCCAGTAAATCCGGAT
+
BBBBBBBABB@ADBDDAFFG55FGGGGCCGFHHGHCHHHHHHHFHDGGGGHHFHHGHGGGHHHHHHGHHHGHF5GHHHHHFHH1EAFGHGCGGGGFGGHHFGGGGGGGHFDGCGGGHHGHGGGGGFHA/<ACGHHHHDHHHHHHFHHHHHHHEHHHHHHFHGHGBFFHGCHHHHFFGGHHHGHHHGGGG?FB0;BED.9BFFFFGFGGFGFFGGFGGGFFFFEB-;;AB.:A@?FF/FB9.A--99FBFFFFFD;DD
@ACB045:20:000000000-A3T9K:1:1101:15842:1692 1:N:0:CGATGT
GCCGGGGGGCTGAAGGGCGAACCGTAAAACGACGGCCAGCACCGCCGCGGTAATACGAAGGTGGCAAGCGTTGTTCGGAATCACTGGGCTTAAAGAGCACGTAGGCGGAGATGTAAGTGTCTTGTGAAATCCCTCGGCTTAACCGAGGAATTGTTGGGCAAACCACATCTCTTGAGGCAAGTAGGGGTGTGTGGAACTCTTGGTGGAGCGGTGGAATGCGTAGATATCAAGAGGAACGCCGAAGGTGAAGACAGCACAC
+
ABBBBBBBBBBBBBFFBBBBBGGGAGFGGGFGGGGGEGHGHHGGGGGGGGGGGHHHHFHFGFHGHHHGHGGGGFHHGGGGGHHHHHHHHHHHHHHHHGHGGHHHGGFGGGGHHHHHGFGGHHHHHHHHHHHGHHGGGGGGHHHHGGGGCGGGGGGGGGGGFGGGGGGGGGGGGGBFFEEFEBFFFFFDFFFFFFBFFFFFFBFFFFFFFFFDBDFFBFFF;DDFFFFFFFFFFFFFFDFFFDFFF/FFFFFFFFEFFFA
@ACB045:20:000000000-A3T9K:1:1101:17415:1693 1:N:0:CGATGT
ACCTGGGTACCGGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTGCCTCAGCGTCAGTGTCGTTCCAGGAGGCCGCCTTCGCCACCGGTGTTCCTCCGAATATCTACGCATTTCACTGCTACACCAGGAATTCCACCTCCCTCTCCCACACTCAAGTCTGGCAGTCAACACGGCAGTCACAAGGTTAAGCCCTGCGATTTACCCTCCGATTTAAAAACCCGCCTACGACCCCTTTAGGCCCAGTAAACCCGATA
+
BBBBBBBDCDDDDDDCCFFGGGGGGGGGGHGHHHHGGFGGGGGGHGHGFHH3FB3EGGFGH555AB1B?BG3G1>E1>FGGGHHGGD1?EGGGFGHHHHH//?FCG22?2FCGGGHHHH2@FHHHDG2/@/FFHFHH1F=FCHGHHG0<<0<<E<G/CG0=00/0<EHC0000..:@@-;:0:0;AAGFF0;BF./;.--CE.:0CB0.9:-.;9:B:FF.A.-:9.9B...--9;A0000;FAF.0;:00/9@>--
"""
fastq1_expected_output = [
("ACB045:20:000000000-A3T9K:1:1101:16852:1690 1:N:0:CGATGT",
"ACCT",
"BBBB",
"TACGGGGGTATCTAATCCCGTTTGCTACTCAAGCTCTCGTCCATCACCGTCAGAAATATCATAGTTAGCTGCTTTCGCTTTCGGCGTTCTTTCCGATATCAACGCATTTCACCGCTCCACCGGAAATTCCACTAACCTCCAATATTCTCTAGATATAAAGTATCTATTCCCTTTCCACGATTGAGTCGTGGTATTTGAAAACAGACTTTTATATCGGGCTACGGACACTTTACGCCCAGTAAATCCGGAT",
"ABB@ADBDDAFFG55FGGGGCCGFHHGHCHHHHHHHFHDGGGGHHFHHGHGGGHHHHHHGHHHGHF5GHHHHHFHH1EAFGHGCGGGGFGGHHFGGGGGGGHFDGCGGGHHGHGGGGGFHA/<ACGHHHHDHHHHHHFHHHHHHHEHHHHHHFHGHGBFFHGCHHHHFFGGHHHGHHHGGGG?FB0;BED.9BFFFFGFGGFGFFGGFGGGFFFFEB-;;AB.:A@?FF/FB9.A--99FBFFFFFD;DD"),
("ACB045:20:000000000-A3T9K:1:1101:15842:1692 1:N:0:CGATGT",
"GCCG",
"ABBB",
"CTGAAGGGCGAACCGTAAAACGACGGCCAGCACCGCCGCGGTAATACGAAGGTGGCAAGCGTTGTTCGGAATCACTGGGCTTAAAGAGCACGTAGGCGGAGATGTAAGTGTCTTGTGAAATCCCTCGGCTTAACCGAGGAATTGTTGGGCAAACCACATCTCTTGAGGCAAGTAGGGGTGTGTGGAACTCTTGGTGGAGCGGTGGAATGCGTAGATATCAAGAGGAACGCCGAAGGTGAAGACAGCACAC",
"BBBBBFFBBBBBGGGAGFGGGFGGGGGEGHGHHGGGGGGGGGGGHHHHFHFGFHGHHHGHGGGGFHHGGGGGHHHHHHHHHHHHHHHHGHGGHHHGGFGGGGHHHHHGFGGHHHHHHHHHHHGHHGGGGGGHHHHGGGGCGGGGGGGGGGGFGGGGGGGGGGGGGBFFEEFEBFFFFFDFFFFFFBFFFFFFBFFFFFFFFFDBDFFBFFF;DDFFFFFFFFFFFFFFDFFFDFFF/FFFFFFFFEFFFA"),
("ACB045:20:000000000-A3T9K:1:1101:17415:1693 1:N:0:CGATGT",
"ACCT",
"BBBB",
"TACCGGGGTATCTAATCCTGTTTGCTCCCCACGCTTTCGTGCCTCAGCGTCAGTGTCGTTCCAGGAGGCCGCCTTCGCCACCGGTGTTCCTCCGAATATCTACGCATTTCACTGCTACACCAGGAATTCCACCTCCCTCTCCCACACTCAAGTCTGGCAGTCAACACGGCAGTCACAAGGTTAAGCCCTGCGATTTACCCTCCGATTTAAAAACCCGCCTACGACCCCTTTAGGCCCAGTAAACCCGATA",
"DCDDDDDDCCFFGGGGGGGGGGHGHHHHGGFGGGGGGHGHGFHH3FB3EGGFGH555AB1B?BG3G1>E1>FGGGHHGGD1?EGGGFGHHHHH//?FCG22?2FCGGGHHHH2@FHHHDG2/@/FFHFHH1F=FCHGHHG0<<0<<E<G/CG0=00/0<EHC0000..:@@-;:0:0;AAGFF0;BF./;.--CE.:0CB0.9:-.;9:B:FF.A.-:9.9B...--9;A0000;FAF.0;:00/9@>--"),
]
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment