Skip to content

Instantly share code, notes, and snippets.

@gregpinero
Created April 19, 2012 18:52
Show Gist options
  • Save gregpinero/2422976 to your computer and use it in GitHub Desktop.
Save gregpinero/2422976 to your computer and use it in GitHub Desktop.
Randomly insert genetic barcodes (and weighted randomly chosen quality values) as the first N bp of each read in a FASTQ file.
"""
Randomly insert genetic barcodes (and weighted randomly chosen quality values) as the first N bp of each read in a FASTQ file.
Greg Pinero 2012
"""
import sys
import argparse
import random
###### User Settings #######
BARCODES=['AATAAG',
'CAGCCG',
'TAGGAG',
'CTAACG',
'GTATCG',
'GTTACG',
'CTTGCG',
'GTCCGG',
'GTCGCG',
'TTGACG',]
#How many times each Phred quality value to appear in your input file.
#(Run this program with the -q option to get a listing for your input file to paste here)
QUAL_VALS_FREQ = {'%': 6659, "'": 4594, '&': 17276, ')': 6952, '(': 19567, '+': 44609, '*': 68542, '-': 39654, ',': 45502, '/': 23120, '.': 52631, '1': 37193,
'0': 56738, '3': 33434, '2': 47743, '5': 35222, '4': 39743, '7': 31984, '6': 51680, '9': 43761, '8': 62523, ';': 55272, ':': 44746, '=': 73031, '<': 64683,
'?': 128232, '>': 79061, 'A': 89914, '@': 148618, 'C': 562118, 'B': 105488, 'E': 196772, 'D': 261132, 'F': 1855}
############################
def weighted_choice(choices):
total = sum(w for c,w in choices)
r = random.uniform(0, total)
upto = 0
for c, w in choices:
if upto+w > r:
return c
upto += w
assert False, "Shouldn't get here"
def main(argv=None):
if argv is None:
argv = sys.argv
parser = argparse.ArgumentParser(
description='Randomly insert barcodes (and weighted randomly chosen quality values) as the first N bp of each read.')
parser.add_argument('-q', action="store_true", default=False, dest="print_quality_weights", help="Print quality weights after run")
parser.add_argument('infile', help="Input FASTQ file")
args = parser.parse_args(argv[1:])
qual_vals_freq = QUAL_VALS_FREQ.items()
lines_since_ref = 0
reads=open(args.infile)
counts = {}
#Go through each line of the FASTQ file and print out modified output
for i,line in enumerate(reads):
if line.startswith('@') and lines_since_ref > 3:
#>3 condition so we don't mistake an @ char in the quality line
lines_since_ref = 0
line = line.strip()
if lines_since_ref in (0,2): #On a ref or + line of FASTQ file
#Just output line as is, we won't change anything here.
print line
elif lines_since_ref == 1: #On a sequence line of FASTQ file
#Insert a random barocde in front of the line
print random.choice(BARCODES) + line
elif lines_since_ref == 3: #On a quality line of a FASTQ file
#Update tracked frequency of quality weights if the user has requested it
if args.print_quality_weights:
for char in line:
if char in counts:
counts[char]+=1
else:
counts[char]=1
#Randomly generate quality values for the inserted barcode weighted by their frequency
new_qual_vals = ''.join([weighted_choice(qual_vals_freq) for j in range(len(BARCODES[0]))])
print new_qual_vals + line
else:
#There are only four types of lines in the FASTQ format, so this should never happen
#in a valid file
raise ValueError("invalid fastq format")
lines_since_ref += 1
reads.close()
if args.print_quality_weights:
print "Frequency of quality values. You can replace QUAL_VALS_FREQ with these."
print counts
print "Sorted by frequency desc:"
print sorted(counts.items(), key=lambda x: -x[1])
if __name__ == '__main__':
sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment