Last active
November 16, 2017 22:28
-
-
Save walterst/98ded207e50802ced85b736a2f78319c to your computer and use it in GitHub Desktop.
Filters a barcode header to remove target characters, e.g. "+" character. Splits on target identifiers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Usage: python filter_barcode_header.py original_barcode_seqs.fastq new_barcode_seqs.fastq
# WARNING: the second file specified will be overwritten if it exists!
# Marker the script splits each FASTQ header on; the text that follows it is
# treated as the barcode portion of the header.
bc_start_indicator = "1:N:0:"
# Characters that are deleted from the barcode portion of each header.
chars_to_strip = ["+"]
import sys
from sys import argv

from cogent.parse.fastq import MinimalFastqParser
from qiime.util import gzip_open
# Positions of the header, sequence and quality fields within each record
# yielded by MinimalFastqParser.
header_index = 0
sequence_index = 1
quality_index = 2


def strip_barcode_chars(header, indicator, unwanted_chars):
    """Return *header* with unwanted characters removed from its barcode.

    The barcode is everything that follows the first occurrence of
    *indicator*.  Text before the indicator is left untouched.

    header -- FASTQ header text (without the leading "@").
    indicator -- marker string that precedes the barcode.
    unwanted_chars -- iterable of substrings to delete from the barcode.

    A header that does not contain *indicator* is returned unchanged
    rather than raising IndexError.
    """
    # partition() splits on the first occurrence only, so a header that
    # happens to contain the indicator twice keeps all of its trailing text.
    prefix, found, barcode = header.partition(indicator)
    if not found:
        return header
    for unwanted in unwanted_chars:
        barcode = barcode.replace(unwanted, "")
    return prefix + indicator + barcode


def main():
    """Rewrite the FASTQ file named by argv[1] into the file named by argv[2].

    Every record is copied with its header passed through
    strip_barcode_chars(); sequence and quality fields are written as
    parsed.  The output file is overwritten if it already exists.  Exits
    with a usage message when fewer than two file arguments are given.
    """
    if len(argv) < 3:
        sys.exit("Usage: python filter_barcode_header.py "
                 "original_barcode_seqs.fastq new_barcode_seqs.fastq")

    input_path = argv[1]
    output_path = argv[2]

    if input_path.endswith('.gz'):
        query_reads = gzip_open(input_path)
    else:
        # NOTE(review): the "U" mode is kept from the original script; it is
        # rejected by Python 3.11+ -- confirm the target interpreter.
        query_reads = open(input_path, "U")

    # try/finally instead of "with" for the input: the object returned by
    # gzip_open is not visible here, so only a close() method is relied on.
    try:
        with open(output_path, "w") as output_fastq:
            for read_data in MinimalFastqParser(query_reads, strict=False):
                final_header = strip_barcode_chars(
                    read_data[header_index], bc_start_indicator,
                    chars_to_strip)
                output_fastq.write("@%s\n%s\n+\n%s\n" % (
                    final_header,
                    read_data[sequence_index],
                    read_data[quality_index]))
    finally:
        query_reads.close()


# Only process files when executed as a script, so that importing this module
# (e.g. to reuse strip_barcode_chars) has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment