Skip to content

Instantly share code, notes, and snippets.

@lpantano
Last active March 18, 2019 18:50
Show Gist options
  • Save lpantano/2a8d5b14fa6f5df7be3b68c006ef729d to your computer and use it in GitHub Desktop.
Save lpantano/2a8d5b14fa6f5df7be3b68c006ef729d to your computer and use it in GitHub Desktop.
detect barcodes for inDrop file
# Read first X lines of R3 of inDrop fastq files and map to
# sequence barcodes to check if the top N match or not.
import argparse
import os [44/1831]
import gzip
from collections import Counter, defaultdict
from Bio.Seq import Seq
def open_fastq(in_file):
""" open a fastq file, using gzip if it is gzipped
from bcbio package
"""
_, ext = os.path.splitext(in_file)
if ext == ".gz":
return gzip.open(in_file, 'rt')
if ext in [".fastq", ".fq", ".fasta", ".fa"]:
return open(in_file, 'r')
return ValueError("File needs to be fastq|fasta|fq|fa [.gz]")
def reverse_complement(seq):
"""Get reverse complement of a sequences
Args:
*seq(str)*: sequence.
>>> GCAT
Returns:
*(str)*: reverse complemente sequence:
>>> ATGC
"""
return str(Seq(seq).reverse_complement())
parser= argparse.ArgumentParser()
parser.add_argument("--fastq",
help="File with R3 reads of inDrops.", required=True)
parser.add_argument("--barcodes",
help="CSV file with barcodes, samplename.", required=True)
parser.add_argument("--lines",
help="Lines to read.", default=50000, type=int)
parser.add_argument("--top",
help="top N reads. Deafult same numbers than barcodes.", type=int)
args = parser.parse_args()
barcodes = {}
reverse = defaultdict()
with open(args.barcodes) as handle:
for line in handle:
columns = line.strip().split(",")
if len(columns) == 1:
columns[1] = columns[0]
barcodes[columns[0]] = columns[1]
used = reverse_complement(columns[0])
reverse[used] = columns[0]
print("barcode %s -> reverse %s (%s)" % (columns[0], used, columns[1]))
top = len(reverse) if args.top is None else args.top
print("Reading top %s barcodes:" % top)
read = 0
detected = Counter()
with open_fastq(args.fastq) as handle:
for line in handle:
if read > args.lines:
break
if line.startswith("@"):
read += 1
seq = handle.readline().strip()
# cheack if NT position equal to barcodes
detected[seq] += 1
# print known barcodes:
for barcode in reverse:
if barcode in detected:
sample = barcodes[reverse[barcode]]
print("This barcode %s (%s) is detected with %s reads." % (barcode, sample, detected[barcode]))
else:
print("This barcode %s has not been detected." % barcode)
# print detected but not in known list
read = 0
detected_sorted = sorted(detected, key=detected.get, reverse=True)
for nts in detected_sorted:
if nts not in reverse:
print("This barcode %s is not in your list (%s reads)." % (nts, detected[nts]))
read += 1
if read > top:
break
# write bcbio file
outfile = "%s-bcbio.txt" % os.path.splitext(args.barcodes)[0]
print("bcbio file is %s" % outfile)
with open(outfile, 'w') as outh:
for barcode in reverse:
if barcode in detected:
sample = barcodes[reverse[barcode]]
outh.write("%s,%s\n" % (barcode, sample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment