Skip to content

Instantly share code, notes, and snippets.

@jodyphelan
Created March 12, 2020 14:50
Show Gist options
  • Save jodyphelan/62250e11ed19bff12c8a6705c0a28d88 to your computer and use it in GitHub Desktop.
Save jodyphelan/62250e11ed19bff12c8a6705c0a28d88 to your computer and use it in GitHub Desktop.
import sys
import gzip
import os.path
import subprocess
from collections import defaultdict
import random
import argparse
rand_generator = random.SystemRandom()
def get_random_file(prefix = None,extension=None):
randint = rand_generator.randint(1,999999)
if prefix:
if extension:
return "%s.%s%s" % (prefix,randint,extension)
else:
return "%s.%s.txt" % (prefix,randint)
else:
if extension:
return "%s.tmp%s" % (randint,extension)
else:
return "%s.tmp.txt" % (randint)
class vcf_class:
def __init__(self,filename,threads=4):
self.samples = []
self.filename = filename
self.threads = threads
self.prefix = filename[:-7]
if nofile(filename+".csi"):
run_cmd("bcftools index %(filename)s" % vars(self))
self.temp_file = get_random_file()
run_cmd("bcftools query -l %(filename)s > %(temp_file)s" % vars(self))
for l in open(self.temp_file):
self.samples.append(l.rstrip())
os.remove(self.temp_file)
def vcf_to_fasta(self,ref_file,threads=4,chunk_size = 50000):
self.ref_file = ref_file
self.chunk_size = chunk_size
self.cmd_split_chr = "bedtools makewindows -g %(ref_file)s.fai -w %(chunk_size)s -s %(chunk_size)s | awk '{print $1\":\"$2\"-\"$3}'" % vars(self)
self.tmp_file = "%s.tmp.txt" % self.prefix
self.threads = threads
cmd = "%(cmd_split_chr)s | parallel -j %(threads)s \"bcftools view %(filename)s -r {} | bcftools view -V indels | bcftools filter -e 'GT=\\\"het\\\"' -S . | bcftools view -c 1 -a | bcftools norm -f %(ref_file)s | bcftools query -f '%%POS[\\t%%IUPACGT]\\n' | sed 's/\.[\/|]\./N/g' | datamash transpose > %(prefix)s.{}.tmp.txt\"" % vars(self)
run_cmd(cmd)
cmd = "paste `%(cmd_split_chr)s | awk '{print \"%(prefix)s.\"$1\".tmp.txt\"}'` > %(tmp_file)s" % vars(self)
run_cmd(cmd)
cmd = "rm `%(cmd_split_chr)s | awk '{print \"%(prefix)s.\"$1\".tmp.txt\"}'`" % vars(self)
run_cmd(cmd)
with open(self.prefix+".snps.fa","w") as O:
for i,l in enumerate(open(self.tmp_file)):
row = l.rstrip().split()
if i==0: continue
s = self.samples[i-1]
seq = "".join(row)
O.write(">%s\n%s\n" % ( s,seq))
run_cmd("rm %s" % self.tmp_file)
def nofile(filename):
"""
Return True if file does not exist
"""
if not os.path.isfile(filename):
return True
else:
return False
def run_cmd(cmd,verbose=1,target=None):
"""
Wrapper to run a command using subprocess with 3 levels of verbosity and automatic exiting if command failed
"""
if target and (target): return True
cmd = "set -u pipefail; " + cmd
if verbose==2:
sys.stderr.write("\nRunning command:\n%s\n" % cmd)
stdout = open("/dev/stdout","w")
stderr = open("/dev/stderr","w")
elif verbose==1:
sys.stderr.write("\nRunning command:\n%s\n" % cmd)
stdout = open("/dev/null","w")
stderr = open("/dev/null","w")
else:
stdout = open("/dev/null","w")
stderr = open("/dev/null","w")
res = subprocess.call(cmd,shell=True,stderr = stderr,stdout = stdout)
stderr.close()
if res!=0:
sys.stderr.write("Command Failed! Please Check!")
exit(1)
def main(args):
vcf_obj = vcf_class(args.vcf)
if args.snps:
vcf_obj.vcf_to_fasta(args.ref)
else:
args.sample_file = get_random_file()
open(args.sample_file,"w").write("\n".join(vcf_obj.samples)+"\n")
run_cmd('cat %(sample_file)s | parallel --bar -j %(threads)s "bcftools consensus -f %(ref)s -s {} %(vcf)s | sed \'s/^>.*/>{}/\' > {}.tmp.fasta"' % vars(args))
run_cmd('cat %s > %s.fa' % (" ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples]), vcf_obj.prefix))
run_cmd('rm %s %s' % (" ".join(["%s.tmp.fasta" % s for s in vcf_obj.samples]), args.sample_file))
parser = argparse.ArgumentParser(description='VCF mergin pipeline',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--vcf',help='sample file',required=True)
parser.add_argument('--ref',help='reference file',required=True)
parser.add_argument('--snps',action="store_true",help='Only use SNPs')
parser.add_argument('--threads','-t',default=4,help='Number of threads')
parser.set_defaults(func=main)
args = parser.parse_args()
args.func(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment