Skip to content

Instantly share code, notes, and snippets.

@gedankenstuecke
Created February 13, 2012 23:34
Show Gist options
  • Save gedankenstuecke/1821464 to your computer and use it in GitHub Desktop.
Save gedankenstuecke/1821464 to your computer and use it in GitHub Desktop.
compare 2 23andMe plaintext files to get the number of SNPs which have the identical genotype over both files
# usage: just run "./compare_snps.py 23andme_file_1.txt 23andme_file_2.txt"
import sys
def read_23andme(filename):
handle = open(filename,"r")
snps = {}
for line in handle:
if line[0] != "#":
line_array = line.split("\t")
snps[line_array[0]] = line_array[3].rstrip()
return snps
def compare_genotypes(first_snps,second_snps):
total_snps = 0
total_in_both = 0
same_snps = 0
different_snps = 0
if len(first_snps) > len(second_snps):
longer = "first"
longer_snps = first_snps
shorter_snps = second_snps
else:
longer = "second"
longer_snps = second_snps
shorter_snps = first_snps
for snp in longer_snps:
if shorter_snps.has_key(snp):
if len(longer_snps[snp]) > 1:
reverse_longer = longer_snps[snp][1]+longer_snps[snp][0]
else:
reverse_longer = longer_snps[snp]
if longer_snps[snp] == shorter_snps[snp]:
same_snps += 1
elif reverse_longer == shorter_snps[snp]:
same_snps += 1
else:
different_snps += 1
total_in_both += 1
else:
print snp + " is missing from the shorter file"
total_snps += 1
print "The "+longer+" file includes more SNPS"
print "In total "+str(total_snps)+" SNPs were tested of which "+str(total_in_both)+" can be found in both files"
print str(same_snps) +" of those SNPs have the same genotype between both files."
print str(different_snps) + " of those SNPs have different genotypes between both files"
first_snps = read_23andme(sys.argv[1])
second_snps = read_23andme(sys.argv[2])
compare_genotypes(first_snps,second_snps)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment