Skip to content

Instantly share code, notes, and snippets.

@nickloman
Created November 27, 2012 20:13
Show Gist options
  • Select an option

  • Save nickloman/4156719 to your computer and use it in GitHub Desktop.

Select an option

Save nickloman/4156719 to your computer and use it in GitHub Desktop.
rarify_from_vcf.py-- Subsample SNP discovery rates for subset of isolates
## rarify SNP discovery:
## n isolates from 1 .. total isolates
## x bootstraps (pick randomly without replacement)
## output:
## NumberIsolates BootStrapNumber Variants NoCalls
import vcf
import sys
import random
from collections import defaultdict
records = [rec for rec in vcf.Reader(filename=sys.argv[1]) if rec.REF != 'N']
genotypes = defaultdict(list)
for rec in records:
for n, sample in enumerate(rec.samples):
genotypes[n].append(sample['GT'])
number_of_samples = len(records[0].samples)
print >>sys.stderr, "%d samples" % (number_of_samples)
print "IsolateCount\tBootstrapIteration\tHomSNPs\tNoCalls\tHetSNPs"
for number_of_isolates in xrange(1, number_of_samples+1):
for bootstrapn in xrange(0, 100):
random.shuffle(genotypes)
snp_set = set()
nocall_set = set()
het_set = set()
for n in xrange(number_of_isolates):
for pos, g in enumerate(genotypes[n]):
if g == '1/1':
snp_set.add("%s-%s" % (pos, g))
elif g is None:
nocall_set.add("%s-%s" % (pos, g))
elif g != '0/0':
het_set.add("%s-%s" % (pos, g))
print "%s\t%s\t%s\t%s\t%s" % (number_of_isolates, bootstrapn, len(snp_set), len(nocall_set), len(het_set))
~
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment