Last active
October 17, 2018 19:21
-
-
Save RyanSchu/210f07c3a70dc375cc21cddffb990fa5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#origianlly written by Brianne Coffey Github@breecoffey | |
#modified by Ryan Schubert | |
import argparse | |
import gzip | |
import re | |
from os.path import expanduser | |
current = expanduser("~") | |
parser = argparse.ArgumentParser(description='Input & Output Files') #create the argument parser | |
parser.add_argument('--VCF', help='The VCF file to open') #variable for VCF file | |
parser.add_argument('--maf', help='Minor allele frequency threshold', const=1, type = float) | |
parser.add_argument('--r2', help='R2 threshold', const=1, type = float) | |
parser.add_argument('--outputdir', "-o", default=current, help='The VCF file to open') #variable for VCF file | |
parser.add_argument('--tag", default=current, help='tag') #variable for VCF file | |
args = parser.parse_args() #parse the arguments | |
#geno = "SNPGenotypes_" + args.VCF | |
#snpsloc = "SNPLoc_" + args.VCF | |
filtered = [] #array to store filtered snp info | |
loc_output = [] #array for snp chrom location output file | |
with gzip.open(args.VCF, "rt") as file: #open the vcf file | |
for line in file: | |
if line.startswith('##'): #strip the meta-information | |
continue | |
if line.startswith('#'): #store the header | |
header = line | |
continue | |
words = line.strip('\n').split('\t') #split row by tab | |
#chrom_num = words[0] #grab chromosome number | |
#chrom_loc = words[1] #grab chromosome location | |
#snp_id = words[2] #grab snp id | |
#ref_allele = words[3] | |
# alt_allele = words[4] | |
#loc_output.append(str(snp_id) + '\t' + "chr"+str(chrom_num) + '\t' + str(chrom_loc)) #append together for the location output file | |
R2_index = -1 | |
af_index = -1 | |
values = words[7].split(';') #grab the 7th column with GT info for each sample | |
af_index = 2 #set initial allele freq to index 2 | |
if(str(values[2][0:4]) == 'MAF='): #check first 3 indicies for AF info, because some snps are missing first 2 values | |
af_index = 2 | |
if(str(values[1][0:4]) == 'MAF='): | |
af_index = 1 | |
if(str(values[0][0:4]) == 'MAF='): | |
af_index = 0 | |
if(str(values[2][0:3]) == 'R2='): #check first 3 indicies for AF info, because some snps are missing first 2 values | |
R2_index = 2 | |
if(str(values[1][0:3]) == 'R2='): | |
R2_index = 1 | |
if(str(values[0][0:3]) == 'R2='): | |
R2_index = 0 | |
R2_float = [float(i) for i in values[R2_index][3:].split(',')] | |
maf_float = [float(i) for i in values[af_index][3:].split(',')] | |
if (af_index != -1 and R2_index != -1) and (max(maf_float) >= args.maf and max(R2_float) >= args.R2): #check if AF is above 0.01 frequency | |
unfiltered_dose = words[9:] #grab unfiltered genotype info | |
dosages = [] #create array for all genotype values to be stored | |
#unfiltered_geno = unfiltered_geno[2:] | |
for i in unfiltered_dose: | |
j = i.split(':') #split info by colon | |
dose = j[1] | |
dosages.append(dose) | |
filt = '\t'.join(map(str, words[0:5])) #convert snpid to string | |
if dosages == []: #if no values, go to next snp | |
continue | |
for k in dosages: | |
filt += '\t' + str(k) | |
filtered.append(filt) #append filtered DS values by tab delimited | |
loc_output.append(str(snp_id) + '\t' + "chr"+str(chrom_num) + '\t' + str(chrom_loc)) | |
#write to output file | |
dose_out = args.outputdir + "/Chr" + arg.tag + "MAF" + str(args.maf) + "R2" + str(args.r2) + "Dosages.txt.gz" | |
#snpsloc = args.outputdir + "SNPLoc.txt.gz" | |
with gzip.open(dose_out, 'wb') as output_file: | |
samples = header.split('\t') #sample IDs | |
output_file.write(('id').encode('utf-8')) | |
samples = samples[9:] #remove first columns from header | |
for name in samples: | |
output_file.write(('\t'+str(name)).encode('utf-8')) | |
for i in filtered: | |
output_file.write((str(i) + '\n').encode('utf-8')) #write out filtered genotype info | |
#with gzip.open(snpsloc, 'wb') as output_file: | |
# output_file.write(("snp\tchr\tpos" + '\n').encode('utf-8')) #concat info for chrom position | |
# for snp in loc_output: | |
# output_file.write((snp + '\n').encode('utf-8')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment