Created
April 19, 2012 18:52
-
-
Save gregpinero/2422976 to your computer and use it in GitHub Desktop.
Randomly insert genetic barcodes (and weighted randomly chosen quality values) as the first N bp of each read in a FASTQ file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Randomly insert genetic barcodes (and weighted randomly chosen quality values) as the first N bp of each read in a FASTQ file. | |
Greg Pinero 2012 | |
""" | |
import sys | |
import argparse | |
import random | |
###### User Settings ####### | |
BARCODES=['AATAAG', | |
'CAGCCG', | |
'TAGGAG', | |
'CTAACG', | |
'GTATCG', | |
'GTTACG', | |
'CTTGCG', | |
'GTCCGG', | |
'GTCGCG', | |
'TTGACG',] | |
#How many times each Phred quality value to appear in your input file. | |
#(Run this program with the -q option to get a listing for your input file to paste here) | |
QUAL_VALS_FREQ = {'%': 6659, "'": 4594, '&': 17276, ')': 6952, '(': 19567, '+': 44609, '*': 68542, '-': 39654, ',': 45502, '/': 23120, '.': 52631, '1': 37193, | |
'0': 56738, '3': 33434, '2': 47743, '5': 35222, '4': 39743, '7': 31984, '6': 51680, '9': 43761, '8': 62523, ';': 55272, ':': 44746, '=': 73031, '<': 64683, | |
'?': 128232, '>': 79061, 'A': 89914, '@': 148618, 'C': 562118, 'B': 105488, 'E': 196772, 'D': 261132, 'F': 1855} | |
############################ | |
def weighted_choice(choices): | |
total = sum(w for c,w in choices) | |
r = random.uniform(0, total) | |
upto = 0 | |
for c, w in choices: | |
if upto+w > r: | |
return c | |
upto += w | |
assert False, "Shouldn't get here" | |
def main(argv=None): | |
if argv is None: | |
argv = sys.argv | |
parser = argparse.ArgumentParser( | |
description='Randomly insert barcodes (and weighted randomly chosen quality values) as the first N bp of each read.') | |
parser.add_argument('-q', action="store_true", default=False, dest="print_quality_weights", help="Print quality weights after run") | |
parser.add_argument('infile', help="Input FASTQ file") | |
args = parser.parse_args(argv[1:]) | |
qual_vals_freq = QUAL_VALS_FREQ.items() | |
lines_since_ref = 0 | |
reads=open(args.infile) | |
counts = {} | |
#Go through each line of the FASTQ file and print out modified output | |
for i,line in enumerate(reads): | |
if line.startswith('@') and lines_since_ref > 3: | |
#>3 condition so we don't mistake an @ char in the quality line | |
lines_since_ref = 0 | |
line = line.strip() | |
if lines_since_ref in (0,2): #On a ref or + line of FASTQ file | |
#Just output line as is, we won't change anything here. | |
print line | |
elif lines_since_ref == 1: #On a sequence line of FASTQ file | |
#Insert a random barocde in front of the line | |
print random.choice(BARCODES) + line | |
elif lines_since_ref == 3: #On a quality line of a FASTQ file | |
#Update tracked frequency of quality weights if the user has requested it | |
if args.print_quality_weights: | |
for char in line: | |
if char in counts: | |
counts[char]+=1 | |
else: | |
counts[char]=1 | |
#Randomly generate quality values for the inserted barcode weighted by their frequency | |
new_qual_vals = ''.join([weighted_choice(qual_vals_freq) for j in range(len(BARCODES[0]))]) | |
print new_qual_vals + line | |
else: | |
#There are only four types of lines in the FASTQ format, so this should never happen | |
#in a valid file | |
raise ValueError("invalid fastq format") | |
lines_since_ref += 1 | |
reads.close() | |
if args.print_quality_weights: | |
print "Frequency of quality values. You can replace QUAL_VALS_FREQ with these." | |
print counts | |
print "Sorted by frequency desc:" | |
print sorted(counts.items(), key=lambda x: -x[1]) | |
if __name__ == '__main__': | |
sys.exit(main()) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment