-
-
Save brantfaircloth/10688524 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
(c) 2014 Brant Faircloth || http://faircloth-lab.org/ | |
All rights reserved. | |
This code is distributed under a 3-clause BSD license. Please see | |
LICENSE.txt for more information. | |
Created on 14 April 2014 12:57 PDT (-0700) | |
""" | |
import sys | |
import vcf | |
import argparse | |
import pandas as pd | |
#import pdb | |
def get_args(): | |
parser = argparse.ArgumentParser(description='Parse fastq files and drop reads containing Ns.') | |
parser.add_argument( | |
'--input', | |
required=True, | |
help="The VCF file to process" | |
) | |
parser.add_argument( | |
'--output', | |
required=True, | |
help="The STRUCTURE file to write" | |
) | |
parser.add_argument( | |
"--filter-identical", | |
action="store_true", | |
default=False, | |
help="""Remove sites with only a single heterozygote genotype.""", | |
) | |
return parser.parse_args() | |
def identical(items): | |
return all(x == items[0] for x in items if x != "-9,-9") | |
def progress(cnt): | |
if cnt % 100 == 0: | |
sys.stdout.write(".") | |
sys.stdout.flush() | |
def main(): | |
args = get_args() | |
alleles = {"A":1, "T":2, "G":3, "C":4} | |
sys.stdout.write("Working") | |
sys.stdout.flush() | |
with open(args.input, 'r') as infile: | |
vcf_reader = vcf.Reader(infile) | |
# create data frame with indiv as row id | |
data = pd.DataFrame() | |
for cnt, record in enumerate(vcf_reader): | |
# prog | |
progress(cnt) | |
# get ref/alt alleles and make those into a list - integer values | |
# in genotypes refer to 0-index positions of this list. | |
reference = [record.REF] + [str(i) for i in record.ALT] | |
# we'll keep a list of all genotypes | |
all_genotypes = [] | |
for sample in record.samples: | |
# deal with missing data | |
if sample.data.GT == None: | |
gt = ",".join(["-9","-9"]) | |
# bung those into list | |
all_genotypes.append(gt) | |
else: | |
# split the integer code | |
coded_gt = [int(i) for i in sample.data.GT.split("/")] | |
# get the ref/alt allele based on integer code and lookup | |
# new integer value in dict/hash table | |
gt = sorted([alleles[reference[i]] for i in coded_gt]) | |
# join those to a string | |
gt = ",".join([str(i) for i in gt]) | |
# bung those into list | |
all_genotypes.append(gt) | |
# filter monomorphic loci | |
if args.filter_identical and not identical(all_genotypes): | |
temp_df = pd.DataFrame({cnt:pd.Series(all_genotypes,index=vcf_reader.samples)}) | |
data = pd.concat([data, temp_df], axis=1) | |
elif not args.filter_identical: | |
temp_df = pd.DataFrame({cnt:pd.Series(all_genotypes,index=vcf_reader.samples)}) | |
data = pd.concat([data, temp_df], axis=1) | |
with open(args.output, 'w') as outf: | |
loci_header = ",".join(["{0}a,{0}b".format(i) for i in data.columns.values.tolist()]) | |
outf.write("indiv,{}\n".format(loci_header)) | |
for row in data.iterrows(): | |
outf.write("{0},{1}\n".format( | |
row[0], | |
",".join(row[1]) | |
)) | |
# flush | |
print "" | |
print "Data contain {} individuals and {} loci output to {}.".format( | |
len(data), | |
len(data.columns.values.tolist()), | |
args.output | |
) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment