@brentp · Last active April 8, 2016 23:08
save MC MQ tags to an LMDB

pip install lmdb

create a database

keys are qname///flag

vals are MC:...\tMQ:... (the two tag values, tab-delimited)

samblaster --addMateTags $bam | python lmdb-mc_mq.py $dbname

Also see run.sh

query a database

use the same key ("%s///%d" % (aln.qname, aln.flag) in pysam) and the value will be the tab-delimited MC and MQ tags.
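for example, a minimal reader (not part of the gist; file names are hypothetical):

import lmdb
import pysam

bam = pysam.Samfile("sample.bam", "rb")
env = lmdb.open("sample.lmdb", readonly=True)  # database written by lmdb-mc_mq.py
with env.begin() as txn:
    for aln in bam:
        val = txn.get("%s///%d" % (aln.qname, aln.flag))
        if val is None:
            continue  # no MC/MQ stored for this alignment
        mc, mq = val.split("\t")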

lmdb-mc_mq.py:

import lmdb
import sys
import time

db = sys.argv[1]
env = lmdb.open(db, readonly=False, create=True,
                map_size=2*2**32,  # 8 GiB maximum database size
                map_async=True, sync=False,
                writemap=True)

SIZE = 1000000
cache = []
t00 = t0 = time.time()
txn = env.begin(write=True)

for i, line in enumerate(sys.stdin, start=1):
    if line[0] == "@": continue
    toks = line.rstrip("\n").split("\t")
    # qname///flag
    key = "%s///%s" % (toks[0], toks[1])
    if toks[-1][:3] == "MQ:":
        # samblaster appends MQ last and MC second-to-last
        mq = toks[-1][5:]
        mc = toks[-2][5:]
    else:
        # otherwise scan all of the optional tags
        tags = toks[11:]
        mc = [x for x in tags if x[:3] == "MC:"][0][5:]
        mq = [x for x in tags if x[:3] == "MQ:"][0][5:]
    # val is MC:...\tMQ:...
    cache.append((key, "%s\t%s" % (mc, mq)))
    if i % SIZE == 0:
        # write sorted batches of SIZE records; sorted keys make putmulti faster
        cursor = txn.cursor()
        print i, time.time() - t00, time.time() - t0,
        t0 = time.time()
        cache.sort()
        cursor.putmulti(cache)
        print time.time() - t0
        t0 = time.time()
        cache = cache[:0]
        cursor.close()

# flush whatever remains in the cache
cursor = txn.cursor()
cache.sort()
cursor.putmulti(cache)
txn.commit()
print "done: wrote %d values" % i
setup script (installs RocksDB 4.3.1, pyrocksdb, samblaster, and pysam):

set -e

sudo apt-get -qy install build-essential libsnappy-dev zlib1g-dev libbz2-dev libgflags-dev python-pip ipython

# build the RocksDB shared library
if [[ ! -e rocksdb-4.3.1 ]]; then
    wget https://github.com/facebook/rocksdb/archive/v4.3.1.tar.gz
    tar xzvf v4.3.1.tar.gz
fi
cd rocksdb-4.3.1
if [[ ! -e librocksdb.so.4.3.1 ]]; then
    make shared_lib
fi
export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:`pwd`/include
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:`pwd`
export LIBRARY_PATH=${LIBRARY_PATH}:`pwd`
cd

# fetch this gist
if [[ ! -d rocks-svtyper ]]; then
    git clone https://gist.github.com/f8688658bc95d6e22f99db97590786fa.git rocks-svtyper
fi

pip install pyrocksdb --user

# build and install samblaster
cd
rm -rf samblaster
git clone https://github.com/GregoryFaust/samblaster
cd samblaster
make
sudo cp samblaster /usr/local/bin

pip install pysam
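a quick smoke test (not part of the gist) that pyrocksdb can see the shared library:

import rocksdb  # an ImportError here usually means librocksdb is not on LD_LIBRARY_PATH
db = rocksdb.DB("smoke.db", rocksdb.Options(create_if_missing=True))
db.put("k", "v")
assert db.get("k") == "v"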
rocksdb-mc_mq.py:

import rocksdb
import sys
import time
# conda install -c https://conda.anaconda.org/activisiongamescience "rocksdb==4.1-0"
# conda install -c https://conda.anaconda.org/activisiongamescience pyrocksdb

opts = rocksdb.Options(create_if_missing=True,
                       max_open_files=300000,
                       write_buffer_size=67108864,
                       max_write_buffer_number=4,
                       target_file_size_base=267108864)
db = rocksdb.DB(sys.argv[1], opts)

SIZE = 1000000
t00 = t0 = time.time()
batch = rocksdb.WriteBatch()

for i, line in enumerate(sys.stdin, start=1):
    if line[0] == "@": continue
    toks = line.rstrip("\n").split("\t")
    # qname///flag
    key = "%s///%s" % (toks[0], toks[1])
    if toks[-1][:3] == "MQ:":
        # samblaster appends MQ last and MC second-to-last
        mq = toks[-1][5:]
        mc = toks[-2][5:]
    else:
        # otherwise scan all of the optional tags
        tags = toks[11:]
        mc = [x for x in tags if x[:3] == "MC:"][0][5:]
        mq = [x for x in tags if x[:3] == "MQ:"][0][5:]
    # val is MC:...\tMQ:...
    batch.put(key, "%s\t%s" % (mc, mq))
    if i % SIZE == 0:
        # flush the batch every SIZE records
        print i, time.time() - t00, time.time() - t0,
        db.write(batch)
        print time.time() - t0
        t0 = time.time()
        batch = rocksdb.WriteBatch()

# flush whatever remains in the final batch
db.write(batch)
print "done: wrote %d values" % i
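reading the tags back mirrors the LMDB case; a sketch (not in the gist; paths hypothetical):

import rocksdb
import pysam

db = rocksdb.DB("sample.rocksdb", rocksdb.Options(), read_only=True)
bam = pysam.Samfile("sample.bam", "rb")
for aln in bam:
    val = db.get("%s///%d" % (aln.qname, aln.flag))
    if val is None:
        continue
    mc, mq = val.split("\t")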
run.sh:

#!/bin/bash
set -o nounset -e -o pipefail

bgr=$HOME/lumpy-sv/scripts/bamkit/bamgroupreads.py
MAX_SPLIT_COUNT=2
MIN_NON_OVERLAP=20

if [ "$#" -eq 0 ]; then
    bam=s3://sscwgs/14590/BAM/Sample_SSC12414/analysis/SSC12414.final.bam
else
    bam=$1
fi

DBPATH=$(basename $bam .bam).rocksdb
if [ "$#" -eq 2 ]; then
    BASE=$2/
    DBPATH=$2/$DBPATH
else
    BASE=$(dirname $DBPATH)
fi

# resolve the directory containing this script
pushd `dirname $0` > /dev/null
SCRIPTPATH=`pwd -P`
popd > /dev/null
P=$SCRIPTPATH

rm -f $BASE/split.pipe
rm -f $BASE/discordant.pipe
mkfifo $BASE/split.pipe
mkfifo $BASE/discordant.pipe
rm -rf $DBPATH

# stream the BAM through samblaster, storing MC/MQ tags in RocksDB while
# teeing splitters and discordants to named pipes
samtools view -b $bam \
    | python $bgr \
    | samblaster --addMateTags --acceptDupMarks --excludeDups \
        --maxSplitCount $MAX_SPLIT_COUNT \
        --minNonOverlap $MIN_NON_OVERLAP \
        --splitterFile $BASE/split.pipe \
        --discordantFile $BASE/discordant.pipe \
    | python $P/rocksdb-mc_mq.py $DBPATH &

samtools view -bS $BASE/split.pipe | samtools sort -m1G -o $BASE/split.unsorted.bam - &
samtools view -bS $BASE/discordant.pipe | samtools sort -m1G -o $BASE/discordant.unsorted.bam - &
wait

rm $BASE/split.pipe
rm $BASE/discordant.pipe
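for a local BAM the invocation would be something like: bash run.sh sample.bam /tmp/work (file names hypothetical); the first argument is the input BAM, defaulting to the hard-coded S3 path above, and the optional second argument is the output directory for the pipes, the splitter/discordant BAMs, and the RocksDB.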
rocks-svtyper (svtyper modified to pull MC/MQ from RocksDB):

#!/usr/bin/env python

import pysam
import argparse, sys
import math, time, re
from collections import Counter
from argparse import RawTextHelpFormatter

__author__ = "Colby Chiang ([email protected])"
__version__ = "$Revision: 0.0.4 $"
__date__ = "$Date: 2016-04-05 09:07 $"

# --------------------------------------
# define functions

def get_args():
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter, description="\
svtyper\n\
author: " + __author__ + "\n\
version: " + __version__ + "\n\
description: Compute genotype of structural variants based on breakpoint depth")
    parser.add_argument('-B', '--bam', type=str, required=True, help='BAM file(s), comma-separated if genotyping multiple BAMs')
    parser.add_argument('-S', '--split_bam', type=str, required=False, help='split-read bam file for sample, comma-separated if genotyping multiple BAMs')
    parser.add_argument('-i', '--input_vcf', type=argparse.FileType('r'), default=None, help='VCF input (default: stdin)')
    parser.add_argument('-o', '--output_vcf', type=argparse.FileType('w'), default=sys.stdout, help='output VCF to write (default: stdout)')
    parser.add_argument('-f', '--splflank', type=int, required=False, default=20, help='min number of split read query bases flanking breakpoint on either side [20]')
    parser.add_argument('-F', '--discflank', type=int, required=False, default=20, help='min number of discordant read query bases flanking breakpoint on either side. (should not exceed read length) [20]')
    parser.add_argument('--split_weight', type=float, required=False, default=1, help='weight for split reads [1]')
    parser.add_argument('--disc_weight', type=float, required=False, default=1, help='weight for discordant paired-end reads [1]')
    parser.add_argument('-n', dest='num_samp', type=int, required=False, default=1000000, help='number of pairs to sample from BAM file for building insert size distribution [1000000]')
    parser.add_argument('-M', action='store_true', dest='legacy', required=False, help='split reads are flagged as secondary, not supplementary. For compatibility with legacy BWA-MEM "-M" flag')
    parser.add_argument('--debug', action='store_true', help='debugging verbosity')
    parser.add_argument('--rocksdb', type=str, help='path to rocks-database')

    # parse the arguments
    args = parser.parse_args()

    # if no input, check if part of pipe and if so, read stdin.
    if args.input_vcf == None:
        if sys.stdin.isatty():
            parser.print_help()
            exit(1)
        else:
            args.input_vcf = sys.stdin

    # send back the user input
    return args
class Vcf(object):
    def __init__(self):
        self.file_format = 'VCFv4.2'
        # self.fasta = fasta
        self.reference = ''
        self.sample_list = []
        self.info_list = []
        self.format_list = []
        self.alt_list = []
        self.add_format('GT', 1, 'String', 'Genotype')

    def add_header(self, header):
        for line in header:
            if line.split('=')[0] == '##fileformat':
                self.file_format = line.rstrip().split('=')[1]
            elif line.split('=')[0] == '##reference':
                self.reference = line.rstrip().split('=')[1]
            elif line.split('=')[0] == '##INFO':
                a = line[line.find('<')+1:line.find('>')]
                r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
                self.add_info(*[b.split('=')[1] for b in r.findall(a)])
            elif line.split('=')[0] == '##ALT':
                a = line[line.find('<')+1:line.find('>')]
                r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
                self.add_alt(*[b.split('=')[1] for b in r.findall(a)])
            elif line.split('=')[0] == '##FORMAT':
                a = line[line.find('<')+1:line.find('>')]
                r = re.compile(r'(?:[^,\"]|\"[^\"]*\")+')
                self.add_format(*[b.split('=')[1] for b in r.findall(a)])
            elif line[0] == '#' and line[1] != '#':
                self.sample_list = line.rstrip().split('\t')[9:]

    # return the VCF header
    def get_header(self):
        header = '\n'.join(['##fileformat=' + self.file_format,
                            '##fileDate=' + time.strftime('%Y%m%d'),
                            '##reference=' + self.reference] + \
                           [i.hstring for i in self.info_list] + \
                           [a.hstring for a in self.alt_list] + \
                           [f.hstring for f in self.format_list] + \
                           ['\t'.join([
                               '#CHROM',
                               'POS',
                               'ID',
                               'REF',
                               'ALT',
                               'QUAL',
                               'FILTER',
                               'INFO',
                               'FORMAT'] + \
                               self.sample_list
                           )])
        return header

    def add_info(self, id, number, type, desc):
        if id not in [i.id for i in self.info_list]:
            inf = self.Info(id, number, type, desc)
            self.info_list.append(inf)

    def add_alt(self, id, desc):
        if id not in [a.id for a in self.alt_list]:
            alt = self.Alt(id, desc)
            self.alt_list.append(alt)

    def add_format(self, id, number, type, desc):
        if id not in [f.id for f in self.format_list]:
            fmt = self.Format(id, number, type, desc)
            self.format_list.append(fmt)

    def add_sample(self, name):
        self.sample_list.append(name)

    # get the VCF column index of a sample
    # NOTE: this is zero-based, like python arrays
    def sample_to_col(self, sample):
        return self.sample_list.index(sample) + 9

    class Info(object):
        def __init__(self, id, number, type, desc):
            self.id = str(id)
            self.number = str(number)
            self.type = str(type)
            self.desc = str(desc)
            # strip the double quotes around the string if present
            if self.desc.startswith('"') and self.desc.endswith('"'):
                self.desc = self.desc[1:-1]
            self.hstring = '##INFO=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'

    class Alt(object):
        def __init__(self, id, desc):
            self.id = str(id)
            self.desc = str(desc)
            # strip the double quotes around the string if present
            if self.desc.startswith('"') and self.desc.endswith('"'):
                self.desc = self.desc[1:-1]
            self.hstring = '##ALT=<ID=' + self.id + ',Description=\"' + self.desc + '\">'

    class Format(object):
        def __init__(self, id, number, type, desc):
            self.id = str(id)
            self.number = str(number)
            self.type = str(type)
            self.desc = str(desc)
            # strip the double quotes around the string if present
            if self.desc.startswith('"') and self.desc.endswith('"'):
                self.desc = self.desc[1:-1]
            self.hstring = '##FORMAT=<ID=' + self.id + ',Number=' + self.number + ',Type=' + self.type + ',Description=\"' + self.desc + '\">'
class Variant(object):
    def __init__(self, var_list, vcf):
        self.chrom = var_list[0]
        self.pos = int(var_list[1])
        self.var_id = var_list[2]
        self.ref = var_list[3]
        self.alt = var_list[4]
        if var_list[5] == '.':
            self.qual = 0
        else:
            self.qual = float(var_list[5])
        self.filter = var_list[6]
        self.sample_list = vcf.sample_list
        self.info_list = vcf.info_list
        self.info = dict()
        self.format_list = vcf.format_list
        self.active_formats = list()
        self.gts = dict()

        # fill in empty sample genotypes
        if len(var_list) < 8:
            sys.stderr.write('\nError: VCF file must have at least 8 columns\n')
            exit(1)
        if len(var_list) < 9:
            var_list.append("GT")

        # make a genotype for each sample at variant
        for s in self.sample_list:
            try:
                s_gt = var_list[vcf.sample_to_col(s)].split(':')[0]
                self.gts[s] = Genotype(self, s, s_gt)
                # import the existing fmt fields
                for j in zip(var_list[8].split(':'), var_list[vcf.sample_to_col(s)].split(':')):
                    self.gts[s].set_format(j[0], j[1])
            except IndexError:
                self.gts[s] = Genotype(self, s, './.')

        self.info = dict()
        i_split = [a.split('=') for a in var_list[7].split(';')] # temp list of split info column
        for i in i_split:
            if len(i) == 1:
                i.append(True)
            self.info[i[0]] = i[1]

    def set_info(self, field, value):
        if field in [i.id for i in self.info_list]:
            self.info[field] = value
        else:
            sys.stderr.write('\nError: invalid INFO field, \"' + field + '\"\n')
            exit(1)

    def get_info(self, field):
        return self.info[field]

    def get_info_string(self):
        i_list = list()
        for info_field in self.info_list:
            if info_field.id in self.info.keys():
                if info_field.type == 'Flag':
                    i_list.append(info_field.id)
                else:
                    i_list.append('%s=%s' % (info_field.id, self.info[info_field.id]))
        return ';'.join(i_list)

    def get_format_string(self):
        f_list = list()
        for f in self.format_list:
            if f.id in self.active_formats:
                f_list.append(f.id)
        return ':'.join(f_list)

    def genotype(self, sample_name):
        if sample_name in self.sample_list:
            return self.gts[sample_name]
        else:
            sys.stderr.write('\nError: invalid sample name, \"' + sample_name + '\"\n')

    def get_var_string(self):
        s = '\t'.join(map(str, [
            self.chrom,
            self.pos,
            self.var_id,
            self.ref,
            self.alt,
            '%0.2f' % self.qual,
            self.filter,
            self.get_info_string(),
            self.get_format_string(),
            '\t'.join(self.genotype(s).get_gt_string() for s in self.sample_list)
        ]))
        return s
class Genotype(object):
    def __init__(self, variant, sample_name, gt):
        self.format = dict()
        self.variant = variant
        self.set_format('GT', gt)

    def set_format(self, field, value):
        if field in [i.id for i in self.variant.format_list]:
            self.format[field] = value
            if field not in self.variant.active_formats:
                self.variant.active_formats.append(field)
                # sort it to be in the same order as the format_list in header
                self.variant.active_formats.sort(key=lambda x: [f.id for f in self.variant.format_list].index(x))
        else:
            sys.stderr.write('\nError: invalid FORMAT field, \"' + field + '\"\n')
            exit(1)

    def get_format(self, field):
        return self.format[field]

    def get_gt_string(self):
        g_list = list()
        for f in self.variant.active_formats:
            if f in self.format:
                if type(self.format[f]) == float:
                    g_list.append('%0.2f' % self.format[f])
                else:
                    g_list.append(self.format[f])
            else:
                g_list.append('.')
        return ':'.join(map(str, g_list))
# efficient combinatorial function to handle extremely large numbers
def log_choose(n, k):
    r = 0.0
    # swap for efficiency if k is more than half of n
    if k * 2 > n:
        k = n - k
    for d in xrange(1, k+1):
        r += math.log(n, 10)
        r -= math.log(d, 10)
        n -= 1
    return r
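# Quick sanity check (not in the original gist): C(10, 3) = 120, so
# log_choose(10, 3) should equal log10(120):
#
#     >>> round(log_choose(10, 3), 4)
#     2.0792
#     >>> round(math.log(120, 10), 4)
#     2.0792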
# return the genotype and log10 p-value
def bayes_gt(ref, alt, is_dup):
    # probability of seeing an alt read with true genotype of hom_ref, het, hom_alt respectively
    if is_dup: # specialized logic to handle non-destructive events such as duplications
        p_alt = [0.01, 0.3, 0.5]
    else:
        p_alt = [0.01, 0.5, 0.9]

    total = ref + alt
    lp_homref = log_choose(total, alt) + alt * math.log(p_alt[0], 10) + ref * math.log(1 - p_alt[0], 10)
    lp_het = log_choose(total, alt) + alt * math.log(p_alt[1], 10) + ref * math.log(1 - p_alt[1], 10)
    lp_homalt = log_choose(total, alt) + alt * math.log(p_alt[2], 10) + ref * math.log(1 - p_alt[2], 10)

    return (lp_homref, lp_het, lp_homalt)
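# Example (not in the original gist): 10 reference reads and 8 alt reads at
# a deletion favor the het genotype (index 1 maps to GT 0/1 below):
#
#     >>> lp = bayes_gt(10, 8, False)
#     >>> lp.index(max(lp))
#     1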
# return the 5' alignment coordinate of the mate read by parsing the MC (mate cigar) SAM field
# biobambam puts the mate's coordinate in the MC tag, while samblaster provides a CIGAR string
def get_mate_5prime(bam, read):
    # if 'MC' in [t[0] for t in read.tags]:
    try:
        mc = read.opt('MC') # the mate CIGAR string
        if mc == '*':
            return
        # try to find from a coordinate (biobambam style)
        try:
            p = int(mc)
        # otherwise get it from a CIGAR string (samblaster style)
        except ValueError:
            keys = re.findall('[MIDNSHPX=]+', mc)
            nums = map(int, re.findall('[^MIDNSHPX=]+', mc))
            p = read.pnext
            for i in xrange(len(keys)):
                k = keys[i]
                n = nums[i]
                if k == 'M' or k == 'N' or k == 'D':
                    p += n
    except KeyError:
        p = bam.mate(read).aend
    return p
def cigar_offset(mc):
    keys = re.findall('[MIDNSHPX=]+', mc)
    nums = map(int, re.findall('[^MIDNSHPX=]+', mc))
    p = 0
    for i in xrange(len(keys)):
        k = keys[i]
        if k == 'M' or k == 'N' or k == 'D':
            n = nums[i]
            p += n
    return p
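# cigar_offset() sums only the reference-consuming operations (M, N, D);
# quick checks (not in the original gist):
#
#     >>> cigar_offset('76M')
#     76
#     >>> cigar_offset('10S66M')      # soft-clip consumes no reference
#     66
#     >>> cigar_offset('30M2D40M6S')  # 30 + 2 + 40
#     72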
def get_mate_stuff(read, db):
    """
    omq = read.opt("MQ")
    omc = read.opt("MC")
    try:
        mq = read.opt("MQ")
        mc = read.opt("MC")
        try:
            p = int(mc)
        except ValueError:
            p = read.pnext + cigar_offset(mc)
        return mq, p
    except KeyError:
        pass
    """
    key = "%s///%s" % (read.query_name, read.flag)
    try:
        # db.get() returns None for a missing key, so the split() below
        # raises; report the offending key before re-raising
        mc, mq = db.get(key).split("\t")
        mq = float(mq)
    except:
        print >>sys.stderr, key, read.is_secondary, read.is_proper_pair
        raise
    #assert omc == mc
    #assert omq == mq
    return float(mq), read.pnext + cigar_offset(mc)
def get_mate_mapq(bam, read):
    # if 'MQ' in [t[0] for t in read.tags]:
    try:
        mq = read.opt('MQ') # the mate mapq score
        if mq == '*':
            return
    except KeyError:
        mq = bam.mate(read).mapq
    return mq
# calculate the probability that a read is concordant at a deletion breakpoint,
# given the putative deletion size and insert distribution of the library.
def p_concordant(read_ospan, var_length, ins_dens):
    conc_prior = 0.95 # a priori probability that a read-pair is concordant
    disc_prior = 1 - conc_prior
    try:
        p = float(ins_dens[read_ospan]) * conc_prior / (conc_prior * ins_dens[read_ospan] + disc_prior * (ins_dens[read_ospan - var_length]))
    except ZeroDivisionError:
        p = None
    return p
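# Toy example (not in the original gist) with a two-point insert-size density:
#
#     >>> dens = Counter({500: 0.6, 800: 0.4})
#     >>> p_concordant(500, 300, dens)            # dens[200] == 0: no discordant support
#     1.0
#     >>> round(p_concordant(800, 300, dens), 3)  # plausible either way
#     0.927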
def count_pairedend(chrom,
                    pos,
                    ci,
                    mate_chrom,
                    mate_pos,
                    mate_ci,
                    o1,
                    o2,
                    svtype,
                    sample,
                    z,
                    discflank,
                    db):
    conc_counter = 0
    disc_counter = 0
    conc_scaled_counter = 0
    disc_scaled_counter = 0
    fetch_flank = sample.get_fetch_flank(z)

    # don't count paired-end reads if deletion is smaller than 2 * standard dev of library
    if svtype == 'DEL':
        for lib in sample.lib_dict.values():
            if abs(mate_pos - pos) < 2 * lib.sd:
                return (conc_counter, disc_counter, conc_scaled_counter, disc_scaled_counter)

    if o1 == '+' or svtype == 'INV':
        # survey for concordant read pairs
        reads0 = []
        for read in sample.bam.fetch(chrom, max(pos - (fetch_flank), 0), pos):
            if read.is_unmapped: continue
            reads0.append(read)
            lib = sample.get_lib(read.opt('RG')) # get the read's library
            if (read.is_reverse
                    or not read.mate_is_reverse
                    or (legacy and read.is_secondary)
                    or (not legacy and read.flag & 2048 == 2048)
                    or read.is_unmapped
                    or read.mate_is_unmapped
                    or read.is_duplicate
                    or read.pos + discflank > pos
                    or read.pnext + lib.read_length - discflank < pos
                    or read.tid != read.rnext
                    or lib not in sample.active_libs
                    ):
                continue
            else:
                try:
                    mate_mapq, p5 = get_mate_stuff(read, db)
                    ospan = p5 - read.pos
                except:
                    continue
                ispan1 = read.pos + discflank
                ispan2 = p5 - discflank - 1 # speed up here
                ispan = ispan2 - ispan1
                prob_conc = p_concordant(ospan, abs(mate_pos - pos), lib.dens)
                if ispan2 > pos and prob_conc is not None: # bail if neither concordant nor discordant
                    conc_counter += 1
                    conc_scaled_counter += prob_conc * (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                    # if a deletion, iterate the discordants for these too
                    if svtype == 'DEL' and ispan2 >= mate_ci[0]:
                        # disc_p = (1 - prob_conc) * (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                        # if disc_p > 0.5:
                        #     print ispan2, mate_ci[0]
                        #     print read
                        disc_counter += 1
                        disc_scaled_counter += (1 - prob_conc) * (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))

        # now look at discordants for corresponding SV types
        if svtype != 'DEL':
            readiter = reads0 or sample.bam.fetch(chrom, max(pos - (fetch_flank), 0), pos)
            for read in readiter:
                lib = sample.get_lib(read.opt('RG')) # get the read's library
                if (read.is_reverse
                        or (legacy and read.is_secondary)
                        or (not legacy and read.flag & 2048 == 2048)
                        or read.is_unmapped
                        or read.mate_is_unmapped
                        or read.is_duplicate
                        or read.pos + discflank > pos
                        or sample.bam.getrname(read.rnext) != mate_chrom
                        or lib not in sample.active_libs
                        ):
                    continue
                try:
                    mate_mapq, mate_5prime = get_mate_stuff(read, db)
                except:
                    continue
                if svtype == 'DUP':
                    if not read.mate_is_reverse or read.pnext > read.pos or read.tid != read.rnext:
                        continue
                    ispan1 = read.pos + discflank
                    ispan2 = mate_5prime - discflank - 1
                    if ispan1 < pos and ispan2 < ispan1 and ispan2 >= mate_ci[0] and ispan2 <= (mate_ci[1] + (lib.mean + lib.sd * z)):
                        # print "+-", read
                        disc_counter += 1
                        disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                elif svtype == 'INV':
                    if read.mate_is_reverse:
                        continue
                    ispan1 = read.pos + discflank
                    ispan2 = read.pnext + discflank
                    if ispan1 < pos and ispan2 <= mate_ci[1] and ispan2 >= (mate_ci[0] - (lib.mean + lib.sd * z)):
                        # print "++", read
                        disc_counter += 1
                        disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                elif svtype == 'BND':
                    if o2 == '-':
                        if not read.mate_is_reverse or read.pnext + lib.read_length - discflank < mate_pos:
                            continue
                        ispan1 = read.pos + discflank
                        ispan2 = mate_5prime - discflank - 1
                        if ispan1 < pos and ispan2 >= mate_ci[0] and ispan2 <= (mate_ci[1] + (lib.mean + lib.sd * z)):
                            # print "+- BND", read
                            disc_counter += 1
                            disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                    if o2 == '+':
                        if read.mate_is_reverse or read.pnext + discflank > mate_pos:
                            continue
                        ispan1 = read.pos + discflank
                        ispan2 = read.pnext + discflank
                        if ispan1 < pos and ispan2 <= mate_ci[1] and ispan2 >= (mate_ci[0] - (lib.mean + lib.sd * z)):
                            disc_counter += 1
                            disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))

    reads1 = []
    if o1 == '-' or svtype == 'INV':
        # survey for concordant read pairs
        for read in sample.bam.fetch(chrom, max(pos, 0), pos + (fetch_flank)):
            if read.is_unmapped: continue
            reads1.append(read)
            lib = sample.get_lib(read.opt('RG')) # get the read's library
            if (not read.is_reverse
                    or read.mate_is_reverse
                    or (legacy and read.is_secondary)
                    or (not legacy and read.flag & 2048 == 2048)
                    or read.is_unmapped
                    or read.mate_is_unmapped
                    or read.is_duplicate
                    or read.aend - discflank < pos
                    or read.pnext + discflank > pos
                    or read.tid != read.rnext
                    or lib not in sample.active_libs
                    ):
                continue
            else:
                #mate_mapq = get_mate_mapq(sample.bam, read)
                try:
                    mate_mapq, p5 = get_mate_stuff(read, db)
                    ospan = read.aend - read.pnext
                except:
                    continue
                ispan1 = read.aend - discflank - 1
                ispan2 = read.pnext + discflank
                ispan = ispan1 - ispan2
                prob_conc = p_concordant(ospan, abs(mate_pos - pos), lib.dens)
                if ispan2 < pos and prob_conc is not None: # bail if neither concordant nor discordant
                    conc_counter += 1
                    conc_scaled_counter += prob_conc * (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                    if svtype == 'DEL' and ispan2 <= mate_ci[1]:
                        # disc_p = (1 - prob_conc) * (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                        # if disc_p > 0.5:
                        #     print ispan1, mate_ci[1]
                        #     print read
                        disc_counter += 1
                        disc_scaled_counter += (1 - prob_conc) * (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))

        # now look at discordants for corresponding SV types
        if svtype != 'DEL':
            readiter = reads1 or sample.bam.fetch(chrom, max(pos, 0), pos + (fetch_flank))
            for read in readiter:
                lib = sample.get_lib(read.opt('RG')) # get the read's library
                if (not read.is_reverse
                        or (legacy and read.is_secondary)
                        or (not legacy and read.flag & 2048 == 2048)
                        or read.is_unmapped
                        or read.mate_is_unmapped
                        or read.is_duplicate
                        or read.aend - discflank < pos
                        or sample.bam.getrname(read.rnext) != mate_chrom
                        or lib not in sample.active_libs
                        ):
                    continue
                #mate_mapq = get_mate_mapq(sample.bam, read) # move this for speed
                #mate_5prime = get_mate_5prime(sample.bam, read)
                try:
                    mate_mapq, mate_5prime = get_mate_stuff(read, db)
                except:
                    continue
                if svtype == 'DUP':
                    if read.mate_is_reverse or read.pnext < read.pos or read.tid != read.rnext:
                        continue
                    ispan1 = read.aend - discflank - 1
                    ispan2 = read.pnext + discflank
                    if ispan1 > pos and ispan2 > ispan1 and ispan2 <= mate_ci[1] and ispan2 >= (mate_ci[0] - (lib.mean + lib.sd * z)):
                        # print "-+", read
                        disc_counter += 1
                        disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                elif svtype == 'INV':
                    if not read.mate_is_reverse:
                        continue
                    ispan1 = read.aend - discflank - 1
                    ispan2 = mate_5prime - discflank - 1
                    if ispan1 > pos and ispan2 >= mate_ci[0] and ispan2 <= (mate_ci[1] + (lib.mean + lib.sd * z)):
                        # print "--", read
                        disc_counter += 1
                        disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                elif svtype == 'BND':
                    if o2 == '-':
                        if not read.mate_is_reverse or read.pnext + lib.read_length - discflank < mate_pos:
                            continue
                        ispan1 = read.aend - discflank - 1
                        ispan2 = mate_5prime - discflank - 1
                        if ispan1 > pos and ispan2 >= mate_ci[0] and ispan2 <= (mate_ci[1] + (lib.mean + lib.sd * z)):
                            disc_counter += 1
                            disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))
                    if o2 == '+':
                        if read.mate_is_reverse or read.pnext + discflank > mate_pos:
                            continue
                        ispan1 = read.aend - discflank - 1
                        ispan2 = read.pnext + discflank
                        if ispan1 > pos and ispan2 <= mate_ci[1] and ispan2 >= (mate_ci[0] - (lib.mean + lib.sd * z)):
                            disc_counter += 1
                            disc_scaled_counter += (1-10**(-read.mapq/10.0)) * (1-10**(-mate_mapq/10.0))

    return (conc_counter, disc_counter, conc_scaled_counter, disc_scaled_counter)
# get the number of entries in the set
def countRecords(myCounter):
    numRecords = sum(myCounter.values())
    return numRecords

# median is approx 50th percentile, except when it is between
# two values in which case it's the mean of them.
def median(myCounter):
    #length is the number of bases we're looking at
    numEntries = countRecords(myCounter)

    # the ordinal value of the middle element
    # if 2 middle elements, then non-integer
    limit = 0.5 * numEntries

    # a list of the values, sorted smallest to largest
    # note that this list contains unique elements only
    valueList = list(myCounter)
    valueList.sort()

    # number of entries we've gone through
    runEntries = 0
    # index of the current value in valueList
    i = 0
    # initiate v, in case list only has one element
    v = valueList[i]

    # move through the value list, iterating by number of
    # entries for each value
    while runEntries < limit:
        v = valueList[i]
        runEntries += myCounter[v]
        i += 1
    if runEntries == limit:
        return (v + valueList[i]) / 2.0
    else:
        return v

# calculate upper median absolute deviation
def upper_mad(myCounter, myMedian):
    residCounter = Counter()
    for x in myCounter:
        if x > myMedian:
            residCounter[abs(x - myMedian)] += myCounter[x]
    return median(residCounter)

# sum of the entries
def sumRecords(myCounter):
    mySum = 0.0
    for c in myCounter:
        mySum += c * float(myCounter[c])
    return mySum

# calculate the arithmetic mean, given a counter
def mean(myCounter):
    # the number of total entries in the set is the
    # sum of the occurrences for each value
    numRecords = countRecords(myCounter)
    u = sumRecords(myCounter) / numRecords
    return u

def stdev(myCounter):
    # the number of total entries in the set is the
    # sum of the occurrences for each value
    numRecords = countRecords(myCounter)
    # u holds the mean
    u = mean(myCounter)
    sumVar = 0.0
    # stdev is sqrt(sum((x-u)^2)/#elements)
    for c in myCounter:
        sumVar += myCounter[c] * (c - u)**2
    myVariance = float(sumVar) / numRecords
    stdev = myVariance**(0.5)
    return stdev
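# These helpers take a collections.Counter keyed by observed value; e.g.
# (not in the original gist):
#
#     >>> c = Counter({300: 2, 310: 1, 900: 1})
#     >>> median(c)            # four entries: mean of the two middle values
#     305.0
#     >>> upper_mad(c, 305.0)  # MAD of the values above the median
#     300.0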
# holds a library's insert size and read length information
class Library(object):
    def __init__(self, bam, name, num_samp):
        self.bam = bam
        self.name = name
        self.num_samp = num_samp
        self.readgroups = list()
        self.read_length = int()
        self.hist = dict()
        self.dens = dict()
        self.mean = float()
        self.sd = float()
        self.prevalence = float()

    def add_readgroup(self, rg):
        self.readgroups.append(rg)

    # calculate the library's prevalence in the BAM file
    def calc_lib_prevalence(self):
        max_count = 100000
        lib_counter = 0
        read_counter = 0
        for read in self.bam.fetch():
            if read_counter == max_count:
                break
            if read.opt('RG') in self.readgroups:
                lib_counter += 1
            read_counter += 1
        self.prevalence = float(lib_counter) / read_counter

    # get read length
    def calc_read_length(self):
        max_rl = 0
        counter = 0
        num_samp = 10000
        for read in self.bam.fetch():
            if read.opt('RG') not in self.readgroups:
                continue
            if read.qlen > max_rl:
                max_rl = read.qlen
            if counter == num_samp:
                break
            counter += 1
        self.read_length = max_rl

    # generate empirical histogram of the sample's insert size distribution
    # CC NOTE: REMOVE BEYOND Z stdev!!!!
    def calc_insert_hist(self):
        counter = 0
        skip = 0
        skip_counter = 0
        mads = 10
        ins_list = []

        # Each entry in valueCounts is a value, and its count is
        # the number of instances of that value observed in the dataset.
        # So valueCount[5] is the number of times 5 has been seen in the data.
        valueCounts = Counter()
        for read in self.bam:
            if skip_counter < skip:
                skip_counter += 1
                continue
            if (read.is_reverse
                    or not read.mate_is_reverse
                    or read.is_unmapped
                    or read.mate_is_unmapped
                    or (legacy and read.is_secondary)
                    or (not legacy and read.flag & 2048 == 2048)
                    or read.tlen <= 0
                    or read.opt('RG') not in self.readgroups):
                continue
            else:
                valueCounts[read.tlen] += 1
                counter += 1
            if counter == self.num_samp:
                break

        # remove outliers
        med = median(valueCounts)
        u_mad = upper_mad(valueCounts, med)
        for x in [x for x in list(valueCounts) if x > med + mads * u_mad]:
            del valueCounts[x]

        self.hist = valueCounts
        self.mean = mean(self.hist)
        self.sd = stdev(self.hist)

    # calculate the density curve for an insert size histogram
    def calc_insert_density(self):
        dens = Counter()
        for i in list(self.hist):
            dens[i] = float(self.hist[i])/countRecords(self.hist)
        self.dens = dens
# holds each sample's BAM and library information
class Sample(object):
    def __init__(self, bam, spl_bam, num_samp, min_lib_prevalence):
        self.name = bam.header['RG'][0]['SM']
        self.bam = bam
        self.spl_bam = spl_bam
        self.lib_dict = dict()
        self.active_libs = []

        # parse library design
        self.rg_to_lib = dict()
        for r in self.bam.header['RG']:
            try:
                lib_name = r['LB']
            except KeyError, e:
                lib_name = ''
            # add the new library
            if lib_name not in self.lib_dict:
                new_lib = Library(self.bam, lib_name, num_samp)
                self.lib_dict[lib_name] = new_lib
            self.rg_to_lib[r['ID']] = self.lib_dict[lib_name]
            self.lib_dict[lib_name].add_readgroup(r['ID'])

        # execute calculations
        for name in list(self.lib_dict):
            self.lib_dict[name].calc_lib_prevalence()
            # delete library if it constitutes less than min fraction of BAM file
            if self.lib_dict[name].prevalence < min_lib_prevalence:
                self.lib_dict.pop(name)
                continue
            self.lib_dict[name].calc_read_length()
            self.lib_dict[name].calc_insert_hist()
            self.lib_dict[name].calc_insert_density()
            self.active_libs += [self.lib_dict[name]]

    # get the maximum fetch flank for reading the BAM file
    def get_fetch_flank(self, z):
        return max([lib.mean + (lib.sd * z) for lib in self.lib_dict.values()])

    # return the library object for a specified read group
    def get_lib(self, readgroup):
        return self.rg_to_lib[readgroup]
# primary function
def sv_genotype(vcf_file,
                bam_string,
                spl_bam_string,
                vcf_out,
                splflank,
                discflank,
                split_weight,
                disc_weight,
                num_samp,
                debug,
                rocksdb_path):

    # parse the comma separated inputs
    bam_list = [pysam.Samfile(b, 'rb') for b in bam_string.split(',')]
    if spl_bam_string is not None:
        spl_bam_list = [pysam.Samfile(b, 'rb') for b in spl_bam_string.split(',')]
        if len(bam_list) != len(spl_bam_list):
            sys.stderr.write('\nError: Number of full BAMs and splitters BAMs differ.\n')
            exit(1)
    else:
        sys.stderr.write('Warning: Splitters BAM file (-S) not found. Performance will suffer.\n')

    min_lib_prevalence = 1e-3 # only consider libraries that constitute at least this fraction of the BAM

    sample_list = list()
    for i in xrange(len(bam_list)):
        if spl_bam_string is None:
            sample = Sample(bam_list[i], None, num_samp, min_lib_prevalence)
        else:
            sample = Sample(bam_list[i], spl_bam_list[i], num_samp, min_lib_prevalence)
        sample_list.append(sample)

    z = 3
    padding = 30 # this is the left/right distance for fetching splitters
    split_slop = 3 # amount of slop around breakpoint to count splitters

    in_header = True
    header = []
    breakend_dict = {} # cache to hold unmatched generic breakends for genotyping
    vcf = Vcf()

    import rocksdb
    # pyrocksdb has no module-level open(); open the tag database with
    # rocksdb.DB and bind it to `db`, the name used throughout this function
    db = rocksdb.DB(rocksdb_path, rocksdb.Options(), read_only=True)
    # read input VCF
    for line in vcf_file:
        if in_header:
            if line[0] == '#':
                header.append(line)
                if line[1] != '#':
                    vcf_samples = line.rstrip().split('\t')[9:]
                continue
            else:
                in_header = False
                vcf.add_header(header)
                # if detailed:
                vcf.add_format('GQ', 1, 'Integer', 'Genotype quality')
                vcf.add_format('SQ', 1, 'Float', 'Phred-scaled probability that this site is variant (non-reference in this sample)')
                vcf.add_format('GL', 'G', 'Float', 'Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy')
                vcf.add_format('DP', 1, 'Integer', 'Read depth')
                vcf.add_format('RO', 1, 'Integer', 'Reference allele observation count, with partial observations recorded fractionally')
                vcf.add_format('AO', 'A', 'Integer', 'Alternate allele observations, with partial observations recorded fractionally')
                vcf.add_format('QR', 1, 'Integer', 'Sum of quality of reference observations')
                vcf.add_format('QA', 'A', 'Integer', 'Sum of quality of alternate observations')
                vcf.add_format('RS', 1, 'Integer', 'Reference allele split-read observation count, with partial observations recorded fractionally')
                vcf.add_format('AS', 'A', 'Integer', 'Alternate allele split-read observation count, with partial observations recorded fractionally')
                vcf.add_format('RP', 1, 'Integer', 'Reference allele paired-end observation count, with partial observations recorded fractionally')
                vcf.add_format('AP', 'A', 'Integer', 'Alternate allele paired-end observation count, with partial observations recorded fractionally')
                vcf.add_format('AB', 'A', 'Float', 'Allele balance, fraction of observations from alternate allele, QA/(QR+QA)')

                # add the samples in the BAM files to the VCF output
                for sample in sample_list:
                    if sample.name not in vcf.sample_list:
                        vcf.add_sample(sample.name)

                # write the output header
                vcf_out.write(vcf.get_header() + '\n')

        v = line.rstrip().split('\t')
        var = Variant(v, vcf)

        # genotype generic breakends
        if var.info['SVTYPE'] == 'BND':
            if var.info['MATEID'] in breakend_dict:
                var2 = var
                var = breakend_dict[var.info['MATEID']]
                chromA = var.chrom
                chromB = var2.chrom
                posA = var.pos
                posB = var2.pos
                # confidence intervals
                ciA = [posA + ci for ci in map(int, var.info['CIPOS'].split(','))]
                ciB = [posB + ci for ci in map(int, var2.info['CIPOS'].split(','))]
                # infer the strands from the alt allele
                if var.alt[-1] == '[' or var.alt[-1] == ']':
                    o1 = '+'
                else: o1 = '-'
                if var2.alt[-1] == '[' or var2.alt[-1] == ']':
                    o2 = '+'
                else: o2 = '-'
            else:
                breakend_dict[var.var_id] = var
                continue
        else:
            chromA = var.chrom
            chromB = var.chrom
            posA = var.pos
            posB = int(var.get_info('END'))
            # confidence intervals
            ciA = [posA + ci for ci in map(int, var.info['CIPOS'].split(','))]
            ciB = [posB + ci for ci in map(int, var.info['CIEND'].split(','))]
            if var.get_info('SVTYPE') == 'DEL':
                o1, o2 = '+', '-'
            elif var.get_info('SVTYPE') == 'DUP':
                o1, o2 = '-', '+'
            elif var.get_info('SVTYPE') == 'INV':
                o1, o2 = '+', '+'

        # increment the negative strand values (note position in VCF should be the base immediately left of the breakpoint junction)
        if o1 == '-': posA += 1
        if o2 == '-': posB += 1
        # if debug: print posA, posB

        # for i in xrange(len(bam_list)):
        for sample in sample_list:
            '''
            Breakend A
            '''
            # Count splitters
            ref_counter_a = Counter()
            spl_counter_a = Counter()
            ref_scaled_counter_a = Counter()
            spl_scaled_counter_a = Counter()
            if spl_bam_string is not None:
                for ref_read in sample.bam.fetch(chromA, max(posA - padding, 0), posA + padding + 1):
                    if not ref_read.is_duplicate and not ref_read.is_unmapped:
                        for p in xrange(ref_read.pos + splflank, ref_read.aend + 1 - splflank):
                            if p - ref_read.pos >= splflank and ref_read.aend - p >= splflank:
                                ref_counter_a[p] += 1
                                ref_scaled_counter_a[p] += (1-10**(-ref_read.mapq/10.0))
                for spl_read in sample.spl_bam.fetch(chromA, max(posA - padding, 0), posA + padding + 1):
                    if not spl_read.is_duplicate and not spl_read.is_unmapped:
                        if o1 == '+' and spl_read.cigar[0][0] == 0:
                            # if debug: print 'o1+', spl_read.aend
                            spl_counter_a[spl_read.aend] += 1
                            spl_scaled_counter_a[spl_read.aend] += (1-10**(-spl_read.mapq/10.0))
                        elif o1 == '-' and spl_read.cigar[-1][0] == 0:
                            # if debug: print 'o1-', spl_read.pos + 1
                            spl_counter_a[spl_read.pos + 1] += 1
                            spl_scaled_counter_a[spl_read.pos + 1] += (1-10**(-spl_read.mapq/10.0))

            # Count paired-end discordants and concordants
            (conc_counter_a,
             disc_counter_a,
             conc_scaled_counter_a,
             disc_scaled_counter_a) = count_pairedend(chromA, posA, ciA,
                                                      chromB, posB, ciB,
                                                      o1, o2,
                                                      var.info['SVTYPE'],
                                                      sample,
                                                      z, discflank, db)

            '''
            Breakend B
            '''
            # Count splitters
            ref_counter_b = Counter()
            spl_counter_b = Counter()
            ref_scaled_counter_b = Counter()
            spl_scaled_counter_b = Counter()
            if spl_bam_string is not None:
                for ref_read in sample.bam.fetch(chromB, max(posB - padding, 0), posB + padding + 1):
                    if not ref_read.is_duplicate and not ref_read.is_unmapped:
                        for p in xrange(ref_read.pos + splflank, ref_read.aend + 1 - splflank):
                            if p - ref_read.pos >= splflank and ref_read.aend - p >= splflank:
                                ref_counter_b[p] += 1
                                ref_scaled_counter_b[p] += (1-10**(-ref_read.mapq/10.0))
                for spl_read in sample.spl_bam.fetch(chromB, max(posB - padding, 0), posB + padding + 1):
                    if not spl_read.is_duplicate and not spl_read.is_unmapped:
                        if o2 == '+' and spl_read.cigar[0][0] == 0:
                            # if debug: print 'o2+', spl_read.aend
                            spl_counter_b[spl_read.aend] += 1
                            spl_scaled_counter_b[spl_read.aend] += (1-10**(-spl_read.mapq/10.0))
                        elif o2 == '-' and spl_read.cigar[-1][0] == 0:
                            # if debug: print 'o2-', spl_read.pos + 1
                            spl_counter_b[spl_read.pos + 1] += 1
                            spl_scaled_counter_b[spl_read.pos + 1] += (1-10**(-spl_read.mapq/10.0))

            # tally up the splitters
            sr_ref_a = int(round(sum(ref_counter_a[p] for p in xrange(posA - split_slop, posA + split_slop + 1)) / float(2 * split_slop + 1)))
            sr_spl_a = sum(spl_counter_a[p] for p in xrange(posA - split_slop, posA + split_slop + 1))
            sr_ref_b = int(round(sum(ref_counter_b[p] for p in xrange(posB - split_slop, posB + split_slop + 1)) / float(2 * split_slop + 1)))
            sr_spl_b = sum(spl_counter_b[p] for p in xrange(posB - split_slop, posB + split_slop + 1))
            sr_ref_scaled_a = sum(ref_scaled_counter_a[p] for p in xrange(posA - split_slop, posA + split_slop + 1)) / float(2 * split_slop + 1)
            sr_spl_scaled_a = sum(spl_scaled_counter_a[p] for p in xrange(posA - split_slop, posA + split_slop + 1))
            sr_ref_scaled_b = sum(ref_scaled_counter_b[p] for p in xrange(posB - split_slop, posB + split_slop + 1)) / float(2 * split_slop + 1)
            sr_spl_scaled_b = sum(spl_scaled_counter_b[p] for p in xrange(posB - split_slop, posB + split_slop + 1))

            # Count paired-end discordants and concordants
            (conc_counter_b,
             disc_counter_b,
             conc_scaled_counter_b,
             disc_scaled_counter_b) = count_pairedend(chromB, posB, ciB,
                                                      chromA, posA, ciA,
                                                      o2, o1,
                                                      var.info['SVTYPE'],
                                                      sample,
                                                      z, discflank, db)

            if debug:
                print '--------------------'
                print sample.name
                print 'sr_a', '(ref, alt)', sr_ref_a, sr_spl_a
                print 'pe_a', '(ref, alt)', conc_counter_a, disc_counter_a
                print 'sr_b', '(ref, alt)', sr_ref_b, sr_spl_b
                print 'pe_b', '(ref, alt)', conc_counter_b, disc_counter_b
                print 'sr_a_scaled', '(ref, alt)', sr_ref_scaled_a, sr_spl_scaled_a
                print 'pe_a_scaled', '(ref, alt)', conc_scaled_counter_a, disc_scaled_counter_a
                print 'sr_b_scaled', '(ref, alt)', sr_ref_scaled_b, sr_spl_scaled_b
                print 'pe_b_scaled', '(ref, alt)', conc_scaled_counter_b, disc_scaled_counter_b

            # merge the breakend support
            split_ref = 0 # set these to zero unless there are informative alt bases for the ev type
            disc_ref = 0
            split_alt = sr_spl_a + sr_spl_b
            if split_alt > 0:
                split_ref = sr_ref_a + sr_ref_b
            disc_alt = disc_counter_a + disc_counter_b
            if disc_alt > 0:
                disc_ref = conc_counter_a + conc_counter_b
            if split_alt == 0 and disc_alt == 0:
                split_ref = sr_ref_a + sr_ref_b
                disc_ref = conc_counter_a + conc_counter_b

            split_scaled_ref = 0 # set these to zero unless there are informative alt bases for the ev type
            disc_scaled_ref = 0
            split_scaled_alt = sr_spl_scaled_a + sr_spl_scaled_b
            if int(split_scaled_alt) > 0:
                split_scaled_ref = sr_ref_scaled_a + sr_ref_scaled_b
            disc_scaled_alt = disc_scaled_counter_a + disc_scaled_counter_b
            if int(disc_scaled_alt) > 0:
                disc_scaled_ref = conc_scaled_counter_a + conc_scaled_counter_b
            if int(split_scaled_alt) == 0 and int(disc_scaled_alt) == 0: # if no alt alleles, set reference
                split_scaled_ref = sr_ref_scaled_a + sr_ref_scaled_b
                disc_scaled_ref = conc_scaled_counter_a + conc_scaled_counter_b

            if split_scaled_alt + split_scaled_ref + disc_scaled_alt + disc_scaled_ref > 0:
                # get bayesian classifier
                if var.info['SVTYPE'] == "DUP": is_dup = True
                else: is_dup = False

                QR = int(split_weight * split_scaled_ref) + int(disc_weight * disc_scaled_ref)
                QA = int(split_weight * split_scaled_alt) + int(disc_weight * disc_scaled_alt)
                gt_lplist = bayes_gt(QR, QA, is_dup)
                gt_idx = gt_lplist.index(max(gt_lplist))

                # print log probabilities of homref, het, homalt
                if debug:
                    print gt_lplist

                # set the overall variant QUAL score and sample specific fields
                var.genotype(sample.name).set_format('GL', ','.join(['%.0f' % x for x in gt_lplist]))
                var.genotype(sample.name).set_format('DP', int(split_scaled_ref + split_scaled_alt + disc_scaled_ref + disc_scaled_alt))
                var.genotype(sample.name).set_format('RO', int(split_scaled_ref + disc_scaled_ref))
                var.genotype(sample.name).set_format('AO', int(split_scaled_alt + disc_scaled_alt))
                var.genotype(sample.name).set_format('QR', QR)
                var.genotype(sample.name).set_format('QA', QA)
                # if detailed:
                var.genotype(sample.name).set_format('RS', int(split_scaled_ref))
                var.genotype(sample.name).set_format('AS', int(split_scaled_alt))
                var.genotype(sample.name).set_format('RP', int(disc_scaled_ref))
                var.genotype(sample.name).set_format('AP', int(disc_scaled_alt))
                try:
                    var.genotype(sample.name).set_format('AB', '%.2g' % (QA / float(QR + QA)))
                except ZeroDivisionError:
                    var.genotype(sample.name).set_format('AB', '.')

                # assign genotypes
                gt_sum = 0
                for gt in gt_lplist:
                    try:
                        gt_sum += 10**gt
                    except OverflowError:
                        gt_sum += 0
                if gt_sum > 0:
                    gt_sum_log = math.log(gt_sum, 10)
                    sample_qual = abs(-10 * (gt_lplist[0] - gt_sum_log)) # phred-scaled probability site is non-reference in this sample
                    if 1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log) == 0:
                        phred_gq = 200
                    else:
                        phred_gq = abs(-10 * math.log(1 - (10**gt_lplist[gt_idx] / 10**gt_sum_log), 10))
                    var.genotype(sample.name).set_format('GQ', int(phred_gq))
                    var.genotype(sample.name).set_format('SQ', sample_qual)
                    var.qual += sample_qual
                    if gt_idx == 1:
                        var.genotype(sample.name).set_format('GT', '0/1')
                    elif gt_idx == 2:
                        var.genotype(sample.name).set_format('GT', '1/1')
                    elif gt_idx == 0:
                        var.genotype(sample.name).set_format('GT', '0/0')
                else:
                    var.genotype(sample.name).set_format('GQ', '.')
                    var.genotype(sample.name).set_format('SQ', '.')
                    var.genotype(sample.name).set_format('GT', './.')
            else:
                var.genotype(sample.name).set_format('GT', './.')
                var.qual = 0
                var.genotype(sample.name).set_format('GQ', '.')
                var.genotype(sample.name).set_format('SQ', '.')
                var.genotype(sample.name).set_format('GL', '.')
                var.genotype(sample.name).set_format('DP', 0)
                var.genotype(sample.name).set_format('AO', 0)
                var.genotype(sample.name).set_format('RO', 0)
                # if detailed:
                var.genotype(sample.name).set_format('AS', 0)
                var.genotype(sample.name).set_format('RS', 0)
                var.genotype(sample.name).set_format('AP', 0)
                var.genotype(sample.name).set_format('RP', 0)
                var.genotype(sample.name).set_format('QR', 0)
                var.genotype(sample.name).set_format('QA', 0)
                var.genotype(sample.name).set_format('AB', '.')

        # after all samples have been processed, write
        vcf_out.write(var.get_var_string() + '\n')
        if var.info['SVTYPE'] == 'BND':
            var2.qual = var.qual
            var2.active_formats = var.active_formats
            var2.genotype = var.genotype
            vcf_out.write(var2.get_var_string() + '\n')

    vcf_out.close()
    return
# --------------------------------------
# main function
def main():
    # parse the command line args
    args = get_args()
    global legacy
    legacy = args.legacy

    # call primary function
    sv_genotype(args.input_vcf,
                args.bam,
                args.split_bam,
                args.output_vcf,
                args.splflank,
                args.discflank,
                args.split_weight,
                args.disc_weight,
                args.num_samp,
                args.debug,
                args.rocksdb)

    # close the files
    args.input_vcf.close()

# initialize the script
if __name__ == '__main__':
    try:
        sys.exit(main())
    except IOError, e:
        if e.errno != 32: # ignore SIGPIPE
            raise