nickloman · November 27, 2012 20:13
diff --git a/rarify_from_vcf.py b/rarify_from_vcf.py

 ## rarify SNP discovery:

 ## n isolates from 1 .. total isolates
 ## x bootstraps (pick randomly without replacement)

 ## output:
 ## NumberIsolates   BootStrapNumber  Variants  NoCalls


 import vcf
 import sys
 import random
 from collections import defaultdict

 records = [rec for rec in vcf.Reader(filename=sys.argv[1]) if rec.REF != 'N']

 genotypes = defaultdict(list)
 for rec in records:
        for n, sample in enumerate(rec.samples):
                genotypes[n].append(sample['GT'])

 number_of_samples = len(records[0].samples)
 print >>sys.stderr,  "%d samples" % (number_of_samples)

 print "IsolateCount\tBootstrapIteration\tHomSNPs\tNoCalls\tHetSNPs"

 for number_of_isolates in xrange(1, number_of_samples+1):
        for bootstrapn in xrange(0, 100):
                random.shuffle(genotypes)
                snp_set = set()
                nocall_set = set()
                het_set = set()
                for n in xrange(number_of_isolates):
                        for pos, g in enumerate(genotypes[n]):
                                if g == '1/1':
                                        snp_set.add("%s-%s" % (pos, g))
                                elif g is None:
                                        nocall_set.add("%s-%s" % (pos, g))
                                elif g != '0/0':
                                        het_set.add("%s-%s" % (pos, g))
                print "%s\t%s\t%s\t%s\t%s" % (number_of_isolates, bootstrapn, len(snp_set), len(nocall_set), len(het_set))
 ~

	## rarify SNP discovery:

	## n isolates from 1 .. total isolates
	## x bootstraps (pick randomly without replacement)

	## output:
	## NumberIsolates BootStrapNumber Variants NoCalls


	import vcf
	import sys
	import random
	from collections import defaultdict

	records = [rec for rec in vcf.Reader(filename=sys.argv[1]) if rec.REF != 'N']

	genotypes = defaultdict(list)
	for rec in records:
	for n, sample in enumerate(rec.samples):
	genotypes[n].append(sample['GT'])

	number_of_samples = len(records[0].samples)
	print >>sys.stderr, "%d samples" % (number_of_samples)

	print "IsolateCount\tBootstrapIteration\tHomSNPs\tNoCalls\tHetSNPs"

	for number_of_isolates in xrange(1, number_of_samples+1):
	for bootstrapn in xrange(0, 100):
	random.shuffle(genotypes)
	snp_set = set()
	nocall_set = set()
	het_set = set()
	for n in xrange(number_of_isolates):
	for pos, g in enumerate(genotypes[n]):
	if g == '1/1':
	snp_set.add("%s-%s" % (pos, g))
	elif g is None:
	nocall_set.add("%s-%s" % (pos, g))
	elif g != '0/0':
	het_set.add("%s-%s" % (pos, g))
	print "%s\t%s\t%s\t%s\t%s" % (number_of_isolates, bootstrapn, len(snp_set), len(nocall_set), len(het_set))
	~
No results found