Created
March 12, 2020 14:50
-
-
Save jodyphelan/62250e11ed19bff12c8a6705c0a28d88 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import gzip | |
import os.path | |
import subprocess | |
from collections import defaultdict | |
import random | |
import argparse | |
rand_generator = random.SystemRandom() | |
def get_random_file(prefix = None,extension=None): | |
randint = rand_generator.randint(1,999999) | |
if prefix: | |
if extension: | |
return "%s.%s%s" % (prefix,randint,extension) | |
else: | |
return "%s.%s.txt" % (prefix,randint) | |
else: | |
if extension: | |
return "%s.tmp%s" % (randint,extension) | |
else: | |
return "%s.tmp.txt" % (randint) | |
class vcf_class: | |
def __init__(self,filename,threads=4): | |
self.samples = [] | |
self.filename = filename | |
self.threads = threads | |
self.prefix = filename[:-7] | |
if nofile(filename+".csi"): | |
run_cmd("bcftools index %(filename)s" % vars(self)) | |
self.temp_file = get_random_file() | |
run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self)) | |
for l in open(self.temp_file): | |
self.samples.append(l.rstrip()) | |
os.remove(self.temp_file) | |
def vcf_to_fasta(self,ref_file,threads=4,chunk_size = 50000): | |
self.ref_file = ref_file | |
self.chunk_size = chunk_size | |
self.cmd_split_chr = "bedtools makewindows -g %(ref_file)s.fai -w %(chunk_size)s -s %(chunk_size)s | awk '{print $1\":\"$2\"-\"$3}'" % vars(self) | |
self.tmp_file = "%s.tmp.txt" % self.prefix | |
self.threads = threads | |
cmd = "%(cmd_split_chr)s | parallel -j %(threads)s \"bcftools view %(filename)s -r {} | bcftools view -V indels | bcftools filter -e 'GT=\\\"het\\\"' -S . | bcftools view -c 1 -a | bcftools norm -f %(ref_file)s | bcftools query -f '%%POS[\\t%%IUPACGT]\\n' | sed 's/\.[\/|]\./N/g' | datamash transpose > %(prefix)s.{}.tmp.txt\"" % vars(self) | |
run_cmd(cmd) | |
cmd = "paste `%(cmd_split_chr)s | awk '{print \"%(prefix)s.\"$1\".tmp.txt\"}'` > %(tmp_file)s" % vars(self) | |
run_cmd(cmd) | |
cmd = "rm `%(cmd_split_chr)s | awk '{print \"%(prefix)s.\"$1\".tmp.txt\"}'`" % vars(self) | |
run_cmd(cmd) | |
with open(self.prefix+".snps.fa","w") as O: | |
for i,l in enumerate(open(self.tmp_file)): | |
row = l.rstrip().split() | |
if i==0: continue | |
s = self.samples[i-1] | |
seq = "".join(row) | |
O.write(">%s\n%s\n" % ( s,seq)) | |
run_cmd("rm %s" % self.tmp_file) | |
def nofile(filename): | |
""" | |
Return True if file does not exist | |
""" | |
if not os.path.isfile(filename): | |
return True | |
else: | |
return False | |
def run_cmd(cmd,verbose=1,target=None): | |
""" | |
Wrapper to run a command using subprocess with 3 levels of verbosity and automatic exiting if command failed | |
""" | |
if target and (target): return True | |
cmd = "set -u pipefail; " + cmd | |
if verbose==2: | |
sys.stderr.write("\nRunning command:\n%s\n" % cmd) | |
stdout = open("/dev/stdout","w") | |
stderr = open("/dev/stderr","w") | |
elif verbose==1: | |
sys.stderr.write("\nRunning command:\n%s\n" % cmd) | |
stdout = open("/dev/null","w") | |
stderr = open("/dev/null","w") | |
else: | |
stdout = open("/dev/null","w") | |
stderr = open("/dev/null","w") | |
res = subprocess.call(cmd,shell=True,stderr = stderr,stdout = stdout) | |
stderr.close() | |
if res!=0: | |
sys.stderr.write("Command Failed! Please Check!") | |
exit(1) | |
def main(args): | |
vcf_obj = vcf_class(args.vcf) | |
if args.snps: | |
vcf_obj.vcf_to_fasta(args.ref) | |
else: | |
args.sample_file = get_random_file() | |
open(args.sample_file,"w").write("\n".join(vcf_obj.samples)+"\n") | |
run_cmd('cat %(sample_file)s | parallel --bar -j %(threads)s "bcftools consensus -f %(ref)s -s {} %(vcf)s | sed \'s/^>.*/>{}/\' > {}.tmp.fasta"' % vars(args)) | |
run_cmd('cat %s > %s.fa' % (" ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples]), vcf_obj.prefix)) | |
run_cmd('rm %s %s' % (" ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples]), args.sample_file)) | |
parser = argparse.ArgumentParser(description='VCF mergin pipeline',formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
parser.add_argument('--vcf',help='sample file',required=True) | |
parser.add_argument('--ref',help='reference file',required=True) | |
parser.add_argument('--snps',action="store_true",help='Only use SNPs') | |
parser.add_argument('--threads','-t',default=4,help='Number of threads') | |
parser.set_defaults(func=main) | |
args = parser.parse_args() | |
args.func(args) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment