Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active November 16, 2017 22:28
Show Gist options
  • Save walterst/98ded207e50802ced85b736a2f78319c to your computer and use it in GitHub Desktop.
Save walterst/98ded207e50802ced85b736a2f78319c to your computer and use it in GitHub Desktop.
Filters a barcode header to remove target characters, e.g. "+" character. Splits on target identifiers.
#!/usr/bin/env python
# Usage: python filter_barcode_header.py original_barcode_seqs.fastq new_barcode_seqs.fastq
# WARNING: the second file specified will be overwritten if it already exists!
from sys import argv

# Marker that separates the leading part of a header from the barcode text;
# everything after its first occurrence is treated as the barcode.
bc_start_indicator = "1:N:0:"
# Characters removed from the barcode portion of every header.
chars_to_strip = ["+"]

# Positions of the fields within each record yielded by the FASTQ parser.
header_index = 0
sequence_index = 1
quality_index = 2


def filter_barcode_header(header):
    """Return `header` with the unwanted characters removed from its barcode.

    The header is divided at the first occurrence of `bc_start_indicator`.
    The text before the indicator is kept verbatim; every character listed
    in `chars_to_strip` is removed from the text after it.

    Raises:
        ValueError: if `header` does not contain `bc_start_indicator`.
    """
    # partition() divides at the first indicator only, so a header that
    # happens to contain the indicator twice keeps all of its trailing text
    # (split() followed by [1] would silently drop it).
    prefix, indicator, barcode = header.partition(bc_start_indicator)
    if not indicator:
        raise ValueError(
            "Header %r does not contain the barcode indicator %r"
            % (header, bc_start_indicator))
    for char_to_strip in chars_to_strip:
        barcode = barcode.replace(char_to_strip, "")
    return prefix + indicator + barcode


def main(args):
    """Rewrite the FASTQ named by args[1] into args[2] with filtered headers.

    args[1] is read (through QIIME's gzip_open when its name ends in '.gz')
    and args[2] is opened for writing, overwriting any existing file.
    """
    if len(args) < 3:
        raise SystemExit(
            "Usage: python filter_barcode_header.py "
            "original_barcode_seqs.fastq new_barcode_seqs.fastq")

    # Imported here rather than at module level so that
    # filter_barcode_header() can be imported without PyCogent/QIIME.
    from cogent.parse.fastq import MinimalFastqParser
    from qiime.util import gzip_open

    input_fp, output_fp = args[1], args[2]
    if input_fp.endswith('.gz'):
        query_reads = gzip_open(input_fp)
    else:
        # "U" (universal newlines) is kept for the Python 2 interpreters
        # that PyCogent/QIIME 1 target.
        query_reads = open(input_fp, "U")

    # try/finally instead of `with` for the input handle: the object
    # returned by gzip_open is assumed to provide close() but may not be a
    # context manager on older interpreters -- TODO confirm.
    try:
        with open(output_fp, "w") as output_fastq:
            for read_data in MinimalFastqParser(query_reads, strict=False):
                output_fastq.write("@%s\n%s\n+\n%s\n" % (
                    filter_barcode_header(read_data[header_index]),
                    read_data[sequence_index],
                    read_data[quality_index]))
    finally:
        query_reads.close()


if __name__ == "__main__":
    main(argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment