Last active
March 18, 2019 18:50
-
-
Save lpantano/2a8d5b14fa6f5df7be3b68c006ef729d to your computer and use it in GitHub Desktop.
detect barcodes for inDrop file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read first X lines of R3 of inDrop fastq files and map to | |
# sequence barcodes to check if the top N match or not. | |
import argparse | |
import os [44/1831] | |
import gzip | |
from collections import Counter, defaultdict | |
from Bio.Seq import Seq | |
def open_fastq(in_file): | |
""" open a fastq file, using gzip if it is gzipped | |
from bcbio package | |
""" | |
_, ext = os.path.splitext(in_file) | |
if ext == ".gz": | |
return gzip.open(in_file, 'rt') | |
if ext in [".fastq", ".fq", ".fasta", ".fa"]: | |
return open(in_file, 'r') | |
return ValueError("File needs to be fastq|fasta|fq|fa [.gz]") | |
def reverse_complement(seq): | |
"""Get reverse complement of a sequences | |
Args: | |
*seq(str)*: sequence. | |
>>> GCAT | |
Returns: | |
*(str)*: reverse complemente sequence: | |
>>> ATGC | |
""" | |
return str(Seq(seq).reverse_complement()) | |
parser= argparse.ArgumentParser() | |
parser.add_argument("--fastq", | |
help="File with R3 reads of inDrops.", required=True) | |
parser.add_argument("--barcodes", | |
help="CSV file with barcodes, samplename.", required=True) | |
parser.add_argument("--lines", | |
help="Lines to read.", default=50000, type=int) | |
parser.add_argument("--top", | |
help="top N reads. Deafult same numbers than barcodes.", type=int) | |
args = parser.parse_args() | |
barcodes = {} | |
reverse = defaultdict() | |
with open(args.barcodes) as handle: | |
for line in handle: | |
columns = line.strip().split(",") | |
if len(columns) == 1: | |
columns[1] = columns[0] | |
barcodes[columns[0]] = columns[1] | |
used = reverse_complement(columns[0]) | |
reverse[used] = columns[0] | |
print("barcode %s -> reverse %s (%s)" % (columns[0], used, columns[1])) | |
top = len(reverse) if args.top is None else args.top | |
print("Reading top %s barcodes:" % top) | |
read = 0 | |
detected = Counter() | |
with open_fastq(args.fastq) as handle: | |
for line in handle: | |
if read > args.lines: | |
break | |
if line.startswith("@"): | |
read += 1 | |
seq = handle.readline().strip() | |
# cheack if NT position equal to barcodes | |
detected[seq] += 1 | |
# print known barcodes: | |
for barcode in reverse: | |
if barcode in detected: | |
sample = barcodes[reverse[barcode]] | |
print("This barcode %s (%s) is detected with %s reads." % (barcode, sample, detected[barcode])) | |
else: | |
print("This barcode %s has not been detected." % barcode) | |
# print detected but not in known list | |
read = 0 | |
detected_sorted = sorted(detected, key=detected.get, reverse=True) | |
for nts in detected_sorted: | |
if nts not in reverse: | |
print("This barcode %s is not in your list (%s reads)." % (nts, detected[nts])) | |
read += 1 | |
if read > top: | |
break | |
# write bcbio file | |
outfile = "%s-bcbio.txt" % os.path.splitext(args.barcodes)[0] | |
print("bcbio file is %s" % outfile) | |
with open(outfile, 'w') as outh: | |
for barcode in reverse: | |
if barcode in detected: | |
sample = barcodes[reverse[barcode]] | |
outh.write("%s,%s\n" % (barcode, sample)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment