Implementation of Python project: https://gist.github.com/kantale/24f4f02d5b87ce0146e0455791e71aeb
| ''' | |
| Copyright (c) 2017, Alexandros Kanterakis kantale@ics.forth.gr | |
| All rights reserved. | |
| Simplified BSD License: | |
| Redistribution and use in source and binary forms, with or without | |
| modification, are permitted provided that the following conditions are met: | |
| 1. Redistributions of source code must retain the above copyright notice, this | |
| list of conditions and the following disclaimer. | |
| 2. Redistributions in binary form must reproduce the above copyright notice, | |
| this list of conditions and the following disclaimer in the documentation | |
| and/or other materials provided with the distribution. | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR | |
| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| ''' | |
| import re | |
| import os | |
| import json | |
| import time | |
| import numpy as np | |
| import argparse | |
| import requests | |
| import progressbar | |
| from pyliftover import LiftOver | |
| import matplotlib.pyplot as plt | |
| from scipy import stats | |
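| # Note on dependencies: the imports above most likely correspond to the PyPI packages | |
| # numpy, scipy, matplotlib, requests, pyliftover and progressbar2 (the progressbar module | |
| # is assumed here to be the progressbar2 package, whose ProgressBar(max_value=...) and | |
| # UnknownLength API is what the code below uses). | |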
| # global liftover variable | |
| lo = None | |
| def print_now(): | |
| ''' | |
| Returns the current time as a formatted string. | |
| Useful when printing progress information. | |
| ''' | |
| return time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()) | |
| def load(filename): | |
| print ('Opening file:', filename) | |
| data_to_return = [] | |
| with open(filename) as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength) | |
| for line in f: | |
| line_counter += 1 | |
| # if line_counter > 10000: | |
| # break | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| line_splitted = line.replace('\n', '').split() | |
| snp_id = line_splitted[0] | |
| rs_id = line_splitted[1] | |
| location = int(line_splitted[2]) | |
| reference = line_splitted[3] | |
| alternative = line_splitted[4] | |
| this_variant_hash = { | |
| ('1','0','0'): (reference, reference), | |
| ('0','1','0'): (reference, alternative), | |
| ('0','0','1'): (alternative, alternative) | |
| } | |
| numeric_genotypes = line_splitted[5:] | |
| # Split the genotype columns into triplets and map each triplet to an allele pair, e.g. ('1','0','0') -> (reference, reference), ('0','1','0') -> (reference, alternative), etc. | |
| allelic_genotypes = [this_variant_hash[(numeric_genotypes[i], numeric_genotypes[i+1], numeric_genotypes[i+2])] for i in range(0, len(numeric_genotypes), 3)] | |
| this_variant = { | |
| 's_id': snp_id, | |
| 'r_id': rs_id, | |
| 'l': location, | |
| 'r': reference, | |
| 'a': alternative, | |
| 'g': allelic_genotypes, | |
| } | |
| data_to_return.append(this_variant) | |
| return data_to_return | |
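| # A minimal sketch of what load() expects and returns, using made-up values: | |
| # | |
| #   input line : "snp_0 rs123 11000 A G 1 0 0 0 1 0 0 0 1" | |
| #   parsed into: {'s_id': 'snp_0', 'r_id': 'rs123', 'l': 11000, 'r': 'A', 'a': 'G', | |
| #                 'g': [('A', 'A'), ('A', 'G'), ('G', 'G')]} | |
| # | |
| # i.e. every sample contributes one (1,0,0) / (0,1,0) / (0,0,1) triplet that is decoded | |
| # into a pair of alleles via this_variant_hash. | |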
| def save(data, filename): | |
| ''' | |
| Save data | |
| ''' | |
| print ('\nSaving data to:', filename) | |
| with open(filename, 'w') as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=len(data)+1) | |
| for d in data: | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| line = [ | |
| d['s_id'], d['r_id'], str(d['l']), d['r'], d['a'] | |
| ] | |
| this_variant_hash = { | |
| (d['r'], d['r']) : ['1', '0', '0'], | |
| (d['r'], d['a']) : ['0', '1', '0'], | |
| (d['a'], d['a']) : ['0', '0', '1'], | |
| } | |
| for alleleic_genotype in d['g']: | |
| line.extend(this_variant_hash[alleleic_genotype]) | |
| f.write(' '.join(line) + '\n') | |
| def read_input(options): | |
| print ('\nLOADING CASES:') | |
| cases = load(options.cases_file) | |
| print ('\nLOADING CONTROLS:') | |
| controls = load(options.controls_file) | |
| return cases, controls | |
| def keep_snps_data(data, keep_snps, output_filename): | |
| data_new = [x for x in data if x['s_id'] in keep_snps] | |
| save(data_new, output_filename) | |
| def remove_snps_data(data, remove_snps, output_filename): | |
| data_new = [x for x in data if not x['s_id'] in remove_snps] | |
| save(data_new, output_filename) | |
| def read_snp_list(filename): | |
| with open(filename) as f: | |
| snps = {x.replace('\n', ''): None for x in f.readlines()} | |
| print ('Read {} SNPs from {}'.format(len(snps), filename)) | |
| return snps | |
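| # Hypothetical example of the SNP-list file format read above (one SNP id per line): | |
| # | |
| #   snp_5 | |
| #   snp_7 | |
| # | |
| # read_snp_list() turns this into {'snp_5': None, 'snp_7': None}, which is only used for | |
| # fast membership tests in keep_snps_data() / remove_snps_data(). | |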
| def keep_snps(cases, controls, output, keep_snps_filename): | |
| keep_snps = read_snp_list(keep_snps_filename) | |
| keep_snps_data(cases, keep_snps, output + '.keep_snps.cases') | |
| keep_snps_data(controls, keep_snps, output + '.keep_snps.controls') | |
| def remove_snps(cases, controls, output, remove_snps_filename): | |
| remove_snps = read_snp_list(remove_snps_filename) | |
| remove_snps_data(cases, remove_snps, output + '.remove_snps.cases') | |
| remove_snps_data(controls, remove_snps, output + '.remove_snps.controls') | |
| def read_sample_list(filename): | |
| sample_ids = { | |
| 'cases': [], | |
| 'controls': [], | |
| } | |
| with open(filename) as f: | |
| for line in f: | |
| sample = line.replace('\n', '') | |
| sample_splitted = sample.split('_') | |
| if len(sample_splitted) != 2: | |
| raise Exception('Unknown sample format: {}'.format(sample)) | |
| cases_controls = sample_splitted[0] | |
| if not cases_controls in ['case', 'control']: | |
| raise Exception('First part of sample id should be case or control') | |
| cases_controls += 's' # control --> controls, case --> cases | |
| try: | |
| case_id = int(sample_splitted[1]) | |
| except ValueError as e: # int() raises ValueError on invalid input | |
| print ('Second part of sample id should be an integer') | |
| raise e | |
| sample_ids[cases_controls].append(case_id-1) | |
| # Remove possible duplicate | |
| for case_control in sample_ids: | |
| sample_ids[case_control] = list(set(sample_ids[case_control])) | |
| print ('Read {} case samples and {} control samples from {}'.format(len(sample_ids['cases']), len(sample_ids['controls']), filename)) | |
| return sample_ids | |
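| # Hypothetical example of the sample-list file format read above, one "case_<N>" or | |
| # "control_<N>" id per line, with N being 1-based: | |
| # | |
| #   case_1 | |
| #   case_3 | |
| #   control_2 | |
| # | |
| # would yield case indexes 0 and 2 and control index 1: the 1-based ids are converted | |
| # to 0-based positions into the genotype lists and duplicates are removed. | |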
| def keep_sample_data(data, samples_to_keep, output_filename): | |
| for d in data: | |
| d['g'] = [g for i, g in enumerate(d['g']) if i in samples_to_keep] | |
| save(data, output_filename) | |
| def keep_samples(cases, controls, output, keep_samples_filename): | |
| samples = read_sample_list(keep_samples_filename) | |
| keep_sample_data(cases, samples['cases'], output + '.keep_samples.cases') | |
| keep_sample_data(controls, samples['controls'], output + '.keep_samples.controls') | |
| def remove_sample_data(data, samples_to_remove, output_filename): | |
| for d in data: | |
| d['g'] = [g for i, g in enumerate(d['g']) if not i in samples_to_remove] | |
| save(data, output_filename) | |
| def remove_samples(cases, controls, output, remove_samples_filename): | |
| samples = read_sample_list(remove_samples_filename) | |
| remove_sample_data(cases, samples['cases'], output + '.remove_samples.cases') | |
| remove_sample_data(controls, samples['controls'], output + '.remove_samples.controls') | |
| def allele_frequency(cases, controls, output): | |
| output_filename = output + '.frequency' | |
| print ('\nSaving frequencies to:', output_filename) | |
| with open(output_filename, 'w') as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=len(cases)+1) | |
| for case, control in zip(cases, controls): | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| to_print = [case['s_id']] | |
| reference = case['r'] | |
| alternative = case['a'] | |
| all_cases = len(case['g']) | |
| all_controls = len(control['g']) | |
| # Count of the reference allele in the controls | |
| ref_controls = sum([genotype.count(reference) for genotype in control['g']]) | |
| #/ (2.0 * len(control['g'])) | |
| # Count of the alternative allele in the controls | |
| alt_controls = sum([genotype.count(alternative) for genotype in control['g']]) | |
| #/ (2.0 * len(control['g'])) | |
| # Count of the reference allele in the cases | |
| ref_cases = sum([genotype.count(reference) for genotype in case['g']]) | |
| #/ (2.0 * len(case['g'])) | |
| # Count of the alternative allele in the cases | |
| alt_cases = sum([genotype.count(alternative) for genotype in case['g']]) | |
| #/ (2.0 * len(case['g'])) | |
| # Count of the reference allele in controls + cases | |
| ref_all = ref_controls + ref_cases | |
| # Count of the alternative allele in controls + cases | |
| alt_all = alt_controls + alt_cases | |
| to_print.extend([ | |
| ref_controls / (2.0 * all_controls), | |
| alt_controls / (2.0 * all_controls), | |
| ref_cases / (2.0 * all_cases), | |
| alt_cases / (2.0 * all_cases), | |
| ref_all / (2.0 * (all_controls+all_cases)), | |
| # alt_all was computed above but never written; maf_from_frequency_file() expects the | |
| # reference and alternative frequencies of controls + cases as the last two columns. | |
| alt_all / (2.0 * (all_controls+all_cases)), | |
| ]) | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| return output_filename | |
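| # The .frequency file written above has one row per SNP with, in order: snp_id, the | |
| # reference and alternative allele frequencies in the controls, in the cases, and in | |
| # controls + cases combined. Each frequency is an allele count divided by | |
| # 2 * number_of_samples, since every diploid sample carries two alleles; the last two | |
| # columns are what maf_from_frequency_file() reads back further down. | |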
| def Hardy_Weinberg_Equilibrium_asymptotic_test(obs_hets, obs_hom1, obs_hom2): | |
| ''' | |
| Compute the Hardy-Weinberg Equilibrium of a SNP by using an asymptotic test. | |
| This is equivalent to the value produced by the --hardy2 option of the plink | |
| software (http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#hardy). | |
| ''' | |
| xx = obs_hom1 | |
| xy = obs_hets | |
| yy = obs_hom2 | |
| if xy == 0 and yy == 0: | |
| return 0.0 | |
| if xx ==0 and xy == 0: | |
| return 0.0 | |
| samples = xx + xy + yy | |
| observed_heterozygosity = float(xy) / samples | |
| alleleX_count = (2*xx) + xy | |
| alleleX_ratio = float(alleleX_count) / (2 * samples) | |
| alleleY_count = xy + (2*yy) | |
| alleleY_ratio = float(alleleY_count) / (2 * samples) | |
| expected_xx_n = alleleX_ratio * alleleX_ratio * samples | |
| expected_xy_n = 2.0 * alleleX_ratio * alleleY_ratio * samples | |
| expected_yy_n = alleleY_ratio * alleleY_ratio * samples | |
| expected_total = expected_xx_n + expected_xy_n + expected_yy_n | |
| expected_xx_ratio = float(expected_xx_n) / expected_total | |
| expected_xy_ratio = float(expected_xy_n) / expected_total | |
| expected_yy_ratio = float(expected_yy_n) / expected_total | |
| chi_square_xx = ((xx - expected_xx_n) ** 2.0) / expected_xx_n | |
| chi_square_xy = ((xy - expected_xy_n) ** 2.0) / expected_xy_n | |
| chi_square_yy = ((yy - expected_yy_n) ** 2.0) / expected_yy_n | |
| chi_square_total = chi_square_xx + chi_square_xy + chi_square_yy | |
| # scipy.stats.chisqprob was removed in SciPy 1.0; chi2.sf computes the same upper-tail probability | |
| return stats.chi2.sf(chi_square_total, 1) | |
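| # What the asymptotic test above computes: from the observed genotype counts (xx, xy, yy) | |
| # it derives the two allele frequencies p and q, builds the expected counts | |
| # (p^2 * N, 2pq * N, q^2 * N) and returns the upper tail of the resulting chi-square | |
| # statistic with one degree of freedom. Hedged usage sketch with made-up counts: | |
| # | |
| #   p_asym = Hardy_Weinberg_Equilibrium_asymptotic_test(obs_hets=57, obs_hom1=14, obs_hom2=50) | |
| # | |
| # Small p-values indicate departure from Hardy-Weinberg equilibrium. | |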
| def Hardy_Weinberg_Equilibrium_exact_test(obs_hets, obs_hom1, obs_hom2): | |
| ''' | |
| According to the comments: | |
| This code implements an exact SNP test of Hardy-Weinberg Equilibrium as described in Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg Equilibrium. | |
| American Journal of Human Genetics. 76: 000 - 000 | |
| Written by Jan Wigginton | |
| Converted to Python from the C/C++ code at http://www.sph.umich.edu/csg/abecasis/Exact/c_instruct.html by Alexandros Kanterakis (kantale@ics.forth.gr) | |
| This test is used by plink when applying the --hardy option (http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#hardy) and from Beagle utilities (http://faculty.washington.edu/browning/beagle_utilities/utilities.html) to compute Hardy-Weinberg Equilibrium. | |
| ''' | |
| if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0: | |
| raise Exception("FATAL ERROR - SNP-HWE: Current genotype configuration (%s %s %s) includes negative count" % (obs_hets, obs_hom1, obs_hom2)) | |
| obs_homc = obs_hom2 if obs_hom1 < obs_hom2 else obs_hom1 | |
| obs_homr = obs_hom1 if obs_hom1 < obs_hom2 else obs_hom2 | |
| rare_copies = 2 * obs_homr + obs_hets | |
| genotypes = obs_hets + obs_homc + obs_homr | |
| het_probs = [0.0] * (rare_copies + 1) | |
| #start at midpoint | |
| mid = rare_copies * (2 * genotypes - rare_copies) // (2 * genotypes) | |
| #check to ensure that midpoint and rare alleles have same parity | |
| if (rare_copies & 1) ^ (mid & 1): | |
| mid += 1 | |
| curr_hets = mid | |
| curr_homr = (rare_copies - mid) // 2 | |
| curr_homc = genotypes - curr_hets - curr_homr | |
| het_probs[mid] = 1.0 | |
| sum = float(het_probs[mid]) | |
| for curr_hets in range(mid, 1, -2): | |
| het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)) | |
| sum += het_probs[curr_hets - 2] | |
| # 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote | |
| curr_homr += 1 | |
| curr_homc += 1 | |
| curr_hets = mid | |
| curr_homr = (rare_copies - mid) // 2 | |
| curr_homc = genotypes - curr_hets - curr_homr | |
| for curr_hets in range(mid, rare_copies - 1, 2): | |
| het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc / ((curr_hets + 2.0) * (curr_hets + 1.0)) | |
| sum += het_probs[curr_hets + 2] | |
| #add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote | |
| curr_homr -= 1 | |
| curr_homc -= 1 | |
| for i in range(0, rare_copies + 1): | |
| het_probs[i] /= sum | |
| #alternate p-value calculation for p_hi/p_lo | |
| p_hi = float(het_probs[obs_hets]) | |
| for i in range(obs_hets, rare_copies+1): | |
| p_hi += het_probs[i] | |
| p_lo = float(het_probs[obs_hets]) | |
| for i in range(obs_hets-1, -1, -1): | |
| p_lo += het_probs[i] | |
| p_hi_lo = 2.0 * p_hi if p_hi < p_lo else 2.0 * p_lo | |
| p_hwe = 0.0 | |
| # p-value calculation for p_hwe | |
| for i in range(0, rare_copies + 1): | |
| if het_probs[i] > het_probs[obs_hets]: | |
| continue | |
| p_hwe += het_probs[i] | |
| p_hwe = 1.0 if p_hwe > 1.0 else p_hwe | |
| return p_hwe | |
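| # Hedged usage sketch with made-up counts, comparing the two HWE tests defined above | |
| # (the exact test is generally preferred when rare alleles give small expected counts): | |
| # | |
| #   hets, hom1, hom2 = 57, 14, 50 | |
| #   p_exact = Hardy_Weinberg_Equilibrium_exact_test(hets, hom1, hom2) | |
| #   p_asym = Hardy_Weinberg_Equilibrium_asymptotic_test(hets, hom1, hom2) | |
| # | |
| # Both return a p-value; hwe() below selects one of them through its test_name argument. | |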
| def hwe(cases, controls, output, test_name): | |
| output_filename = output + '.hwe_' + test_name | |
| print ('\nSaving HWE to:', output_filename) | |
| test_function = { | |
| 'asymptotic': Hardy_Weinberg_Equilibrium_asymptotic_test, | |
| 'exact': Hardy_Weinberg_Equilibrium_exact_test, | |
| }[test_name] | |
| with open(output_filename, 'w') as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=len(cases)+1) | |
| for case, control in zip(cases, controls): | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| to_print = [case['s_id']] | |
| homozygous_1 = (case['r'], case['r']) | |
| homozygous_2 = (case['a'], case['a']) | |
| heterozygous = (case['r'], case['a']) | |
| all_genotypes = case['g'] + control['g'] | |
| obs_hets = all_genotypes.count(heterozygous) | |
| obs_hom1 = all_genotypes.count(homozygous_1) | |
| obs_hom2 = all_genotypes.count(homozygous_2) | |
| assert obs_hets + obs_hom1 + obs_hom2 == len(all_genotypes) | |
| p_value = test_function(obs_hets, obs_hom1, obs_hom2) | |
| to_print.append(p_value) | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| return output_filename | |
| def SNP_alleles( | |
| SNP = None, | |
| missing = "0", | |
| ): | |
| ''' | |
| Get the two different alleles of a SNP. The parameter SNP is a list of tuples with genotypes. | |
| For example if SNP = [(A,A), (A,G), (G,G), (G,A)] the function returns (A,G) | |
| ''' | |
| allele_1 = None | |
| for gen in SNP: | |
| if gen[0] != missing: | |
| if not allele_1: | |
| allele_1 = gen[0] | |
| elif gen[0] != allele_1: | |
| return (allele_1, gen[0]) | |
| if gen[1] != missing: | |
| if not allele_1: | |
| allele_1 = gen[1] | |
| elif gen[1] != allele_1: | |
| return (allele_1, gen[1]) | |
| return (allele_1, allele_1) | |
| def Pairwise_linkage_disequilibrium( | |
| SNP_1 = None, | |
| SNP_2 = None, | |
| ): | |
| ''' | |
| Computes the linkage disequilibrium between two SNPs. Computes the r-squared, D' and the estimated haplotype frequencies. | |
| It has been developed to produce the same values as plink with --ld option. (http://pngu.mgh.harvard.edu/~purcell/plink/ld.shtml) | |
| Returns a dictionary with the following keys: | |
| "haplotypes" : A list of tuples, one per haplotype, each with three values: (1) the haplotype, (2) the estimated haplotype frequency and (3) the expected frequency under linkage equilibrium. | |
| "R_sq" : The r-square. | |
| "Dprime" : The D prime. | |
| ''' | |
| def Minor_allele_frequency( | |
| allele1=None, | |
| allele2=None, | |
| observations_chr1 = None, | |
| observations_chr2 = None | |
| ): | |
| """ | |
| Calculates the Minor Allele Frequency of a SNP. | |
| >>> Minor_allele_frequency("A", "C", ["A", "A", "C", "C", "C"], ["A", "C", "C", "C","C"]) | |
| (0.3, 'A', 10) | |
| """ | |
| allele1_chr1 = observations_chr1.count(allele1) | |
| allele2_chr1 = observations_chr1.count(allele2) | |
| if observations_chr2: | |
| allele1_chr2 = observations_chr2.count(allele1) | |
| allele2_chr2 = observations_chr2.count(allele2) | |
| else: | |
| allele1_chr2 = 0 | |
| allele2_chr2 = 0 | |
| observations = allele1_chr1 + allele2_chr1 + allele1_chr2 + allele2_chr2 | |
| allele1_maf = float((allele1_chr1 + allele1_chr2)) / observations | |
| if allele1_maf < 0.5: | |
| return (allele1_maf, allele1, observations) | |
| else: | |
| return (1.0 - allele1_maf, allele2, observations) | |
| ret = {} | |
| (SNP_1_allele_1, SNP_1_allele_2) = SNP_alleles(SNP_1) | |
| (SNP_2_allele_1, SNP_2_allele_2) = SNP_alleles(SNP_2) | |
| Haplotype_1_1 = [x[0] for x in SNP_1] | |
| Haplotype_1_2 = [x[1] for x in SNP_1] | |
| Haplotype_2_1 = [x[0] for x in SNP_2] | |
| Haplotype_2_2 = [x[1] for x in SNP_2] | |
| MAF_1 = Minor_allele_frequency(SNP_1_allele_1, SNP_1_allele_2, Haplotype_1_1, Haplotype_1_2) | |
| MAF_2 = Minor_allele_frequency(SNP_2_allele_1, SNP_2_allele_2, Haplotype_2_1, Haplotype_2_2) | |
| if MAF_1[1] == SNP_1_allele_1: | |
| freq_1_1 = MAF_1[0] | |
| freq_1_2 = 1.0 - freq_1_1 | |
| elif MAF_1[1] == SNP_1_allele_2: | |
| freq_1_2 = MAF_1[0] | |
| freq_1_1 = 1.0 - freq_1_2 | |
| if MAF_2[1] == SNP_2_allele_1: | |
| freq_2_1 = MAF_2[0] | |
| freq_2_2 = 1.0 - freq_2_1 | |
| elif MAF_2[1] == SNP_2_allele_2: | |
| freq_2_2 = MAF_2[0] | |
| freq_2_1 = 1.0 - freq_2_2 | |
| freq_A_B = freq_1_1 * freq_2_1 | |
| freq_A_b = freq_1_1 * freq_2_2 | |
| freq_a_B = freq_1_2 * freq_2_1 | |
| freq_a_b = freq_1_2 * freq_2_2 | |
| Haps = list(zip(Haplotype_1_1, Haplotype_1_2, Haplotype_2_1, Haplotype_2_2)) | |
| N11 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1]) | |
| N21 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1]) | |
| N31 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1]) | |
| N12 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))]) | |
| N22 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))]) | |
| N32 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))]) | |
| N13 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2]) | |
| N23 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2]) | |
| N33 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2]) | |
| N1_ = N11 + N12 + N13 | |
| N2_ = N21 + N22 + N23 | |
| N3_ = N31 + N32 + N33 | |
| N_1 = N11 + N21 + N31 | |
| N_2 = N12 + N22 + N32 | |
| N_3 = N13 + N23 + N33 | |
| N = N1_ + N2_ + N3_ | |
| pAB = 0.25 | |
| pab = 0.25 | |
| pAb = 0.25 | |
| paB = 0.25 | |
| for nn in range(10): # 10 should be enough | |
| nAB = (2*N11) + N12 + N21 + (( (pAB*pab) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| nab = (2*N33) + N23 + N32 + (( (pAB*pab) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| nAb = (2*N13) + N12 + N23 + (( (pAb*paB) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| naB = (2*N31) + N21 + N32 + (( (pAb*paB) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| pAB = nAB / (2.0 * N) | |
| pab = nab / (2.0 * N) | |
| pAb = nAb / (2.0 * N) | |
| paB = naB / (2.0 * N) | |
| ret["haplotypes"] = [ | |
| (SNP_1_allele_1 + SNP_2_allele_1, pAB, freq_A_B), | |
| (SNP_1_allele_1 + SNP_2_allele_2, pAb, freq_A_b), | |
| (SNP_1_allele_2 + SNP_2_allele_1, paB, freq_a_B), | |
| (SNP_1_allele_2 + SNP_2_allele_2, pab, freq_a_b), | |
| ] | |
| Observed = [pAB, pAb, paB, pab] | |
| Expected = [freq_A_B, freq_A_b, freq_a_B, freq_a_b] | |
| R_sq = sum([((o-e)*(o-e))/e for o,e in zip(Observed, Expected)]) | |
| ret["R_sq"] = R_sq | |
| pA = pAB + pAb | |
| pB = pAB + paB | |
| D = pAB - (pA*pB) | |
| if D>=0: | |
| Dmax = min(pA * (1-pB), (1-pA)*pB) | |
| else: | |
| Dmax = min(pA * pB, (1-pA)*(1-pB)) | |
| Dprime = D / Dmax | |
| ret["Dprime"] = Dprime | |
| return ret | |
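| # Hedged usage sketch for the LD computation above, with made-up genotypes for two SNPs | |
| # typed on the same five samples: | |
| # | |
| #   SNP_1 = [('A', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'G'), ('A', 'A')] | |
| #   SNP_2 = [('C', 'C'), ('C', 'T'), ('T', 'T'), ('C', 'T'), ('C', 'C')] | |
| #   ld_result = Pairwise_linkage_disequilibrium(SNP_1, SNP_2) | |
| #   print(ld_result['R_sq'], ld_result['Dprime'], ld_result['haplotypes']) | |
| # | |
| # The haplotype frequencies are estimated with a small, fixed number of EM iterations | |
| # (the range(10) loop above), mirroring what plink reports with --ld. | |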
| def find_snp_index_from_data(snp_id, data): | |
| # THIS IS SLOW!!! | |
| # We can improve it! | |
| for index, d in enumerate(data): # search the data that was passed in | |
| if d['s_id'] == snp_id: | |
| return index | |
| return None | |
| def ld(cases, controls, snp_1, snp_2, output): | |
| print ('SNP 1:', snp_1) | |
| print ('SNP 2:', snp_2) | |
| output_filename = output + '.ld' | |
| print ('Saving LD to:', output_filename) | |
| snp_1_index = find_snp_index_from_data(snp_1, cases) | |
| snp_2_index = find_snp_index_from_data(snp_2, cases) | |
| snp_1_data = cases[snp_1_index]['g'] + controls[snp_1_index]['g'] | |
| snp_2_data = cases[snp_2_index]['g'] + controls[snp_2_index]['g'] | |
| ld_result = Pairwise_linkage_disequilibrium(snp_1_data, snp_2_data) | |
| with open(output_filename, 'w') as f: | |
| to_print = [snp_1, snp_2, | |
| ld_result['R_sq'], | |
| ld_result['Dprime'], | |
| ] + ["{}:{},{}".format(x[0], x[1], x[2]) for x in ld_result['haplotypes']] | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| def genetic_association_test(SNPs, phenotypes, tests): | |
| ''' | |
| For testing make sure that this method returns the same values as plink | |
| >>> SNPs = [('A','A'), ('A','G'), ('G','G'), ('G','A'), ('G', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'A'), ('G', 'A'), ('A', 'A'), ('A','A'), ('A','G'), ('G','G'), ('G','A'), ('G', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'A'), ('G', 'A'), ('A', 'A')] | |
| >>> phenotypes = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1] | |
| >>> tests = ['GENO', 'TREND', 'ALLELIC', 'DOM', 'REC'] # All of them | |
| >>> results = genetic_association_test(SNPs, phenotypes, tests) | |
| >>> ' '.join([' '.join(['%0.4f' % results[t]['CHISQ'], '%0.4f' % results[t]['P']]) for t in tests]) | |
| '0.2176 0.6409 0.2198 0.6392 0.5128 0.7738 0.0105 0.9185 0.4945 0.4819' | |
| ''' | |
| #Assert that phenotypes and genotypes have the same length | |
| assert len(SNPs) == len(phenotypes) | |
| #Get the two different alleles of this SNP | |
| allele_1, allele_2 = SNP_alleles(SNPs) | |
| #Genotype Contingency table: | |
| n = np.zeros((2 + 1, 3 + 1)) | |
| #Allelic Contingency table: | |
| m = np.zeros((2 + 1, 2 + 1)) | |
| #Expected values for Genotype Contingency table: | |
| En = np.zeros((2 + 1, 3 + 1)) | |
| #Expected values for Allelic Contingency table | |
| Em = np.zeros((2 + 1, 2 + 1)) | |
| #Total values for Genotype Contingency table | |
| nS3 = np.zeros(3 + 1) | |
| n2S = np.zeros(2 + 1) | |
| #Total values for Allelic Contingency table | |
| m2S = np.zeros(2 + 1) | |
| mS2 = np.zeros(2 + 1) | |
| samples = float(len(SNPs)) | |
| #Build Contingency tables | |
| for SNP, phenotype in zip(SNPs, phenotypes): | |
| count_allele_1 = SNP.count(allele_1) | |
| count_allele_2 = SNP.count(allele_2) | |
| allele_1_1 = 1 if SNP == (allele_1, allele_1) else 0 | |
| allele_1_2 = 1 if allele_1 in SNP and allele_2 in SNP else 0 | |
| allele_2_2 = 1 if SNP == (allele_2, allele_2) else 0 | |
| mS2[1] += count_allele_1 | |
| mS2[2] += count_allele_2 | |
| m2S[phenotype] += 1 | |
| m[phenotype][1] += count_allele_1 | |
| m[phenotype][2] += count_allele_2 | |
| nS3[1] += allele_1_1 | |
| nS3[2] += allele_1_2 | |
| nS3[3] += allele_2_2 | |
| n2S[phenotype] += 1 | |
| n[phenotype][1] += allele_1_1 | |
| n[phenotype][2] += allele_1_2 | |
| n[phenotype][3] += allele_2_2 | |
| results = {} | |
| cochran_armitage_tests = ['TREND', 'DOM', 'REC'] | |
| for test in tests: | |
| if test == 'ALLELIC': | |
| chi_square = 0.0 | |
| for i in range(1,3): | |
| for j in range(1,4): | |
| En[i][j] = (n2S[i] * nS3[j]) / samples | |
| if En[i][j] > 0.0: | |
| chi_square += ((n[i][j] - En[i][j]) ** 2) / En[i][j] | |
| p_value = stats.chi2.sf(chi_square, 2) # chisqprob was removed in SciPy 1.0; chi2.sf is equivalent | |
| elif test == 'GENO': | |
| chi_square = 0.0 | |
| for i in range(1,3): | |
| for j in range(1,3): | |
| Em[i][j] = (m2S[i] * mS2[j]) / samples | |
| if Em[i][j] > 0.0: | |
| chi_square += ((m[i][j] - Em[i][j]) ** 2) / Em[i][j] | |
| p_value = stats.chi2.sf(chi_square, 1) | |
| elif test in cochran_armitage_tests: | |
| w = [[0,0,1,2], [0,0,1,1], [0,0,0,1]][cochran_armitage_tests.index(test)] | |
| # Numerator | |
| nom = 0.0 | |
| for i in range(1,4): | |
| nom += w[i] * (n[1][i] * n2S[2] - n[2][i] * n2S[1]) | |
| nom = nom ** 2 | |
| #First part of denominator | |
| den1 = 0.0 | |
| for i in range(1,4): | |
| den1 += (w[i]**2) * nS3[i] * (samples - nS3[i]) | |
| #Second part of denominator | |
| den2 = 0.0 | |
| for i in range(1,3): | |
| for j in range(i+1,4): | |
| den2 += w[i] * w[j] * nS3[i] * nS3[j] | |
| den2 = 2.0 * den2 | |
| #Third part of denominator | |
| den3 = (n2S[1] * n2S[2]) / samples | |
| #The denominator | |
| den = den3 * (den1 - den2) | |
| chi_square = nom / den | |
| p_value = stats.chi2.sf(chi_square, 1) | |
| else: | |
| raise Exception('Unknown test name: "%s"' % (str(test))) | |
| results[test] = {'CHISQ': chi_square, 'P' : p_value} | |
| return results | |
| def association_test(cases, controls, output, test_names): | |
| output_filename = output + '.association' | |
| print ('\nSaving significance testing to file:', output_filename) | |
| if not test_names: | |
| tests = ['GENO'] | |
| else: | |
| tests = list(set(test_names)) | |
| print ('Performing the following tests: ', ' '.join(tests)) | |
| with open(output_filename, 'w') as f: | |
| header = ['snp_id', 'location'] | |
| for test_name in tests: | |
| header.extend([test_name + '_pvalue', test_name + '_chisquare']) | |
| #Write header | |
| f.write(' '.join(header) + '\n') | |
| bar = progressbar.ProgressBar(max_value=len(cases)+1) | |
| line_counter = 0 | |
| for case, control in zip(cases, controls): | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| genotypes = case['g'] + control['g'] | |
| phenotypes = [2] * len(case['g']) + [1] * len(control['g']) | |
| test_result = genetic_association_test(genotypes, phenotypes, tests) | |
| to_print = [case['s_id'], case['l']] | |
| for test_name in tests: | |
| to_print.extend([test_result[test_name]['P'], test_result[test_name]['CHISQ']]) | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| def plot_manhattan(location, p_value, output_filename, title='Manhattan plot'): | |
| plt.plot(location, -np.log(p_value), '.', markersize=5, color='black') | |
| plt.xlabel('Location') | |
| plt.ylabel('-log(p)') | |
| plt.title(title) | |
| #Plot significance threshold as a red line | |
| plt.plot([location[0], location[-1]], [6, 6], color='red') | |
| plt.xlim(location[0], location[-1]) | |
| png = output_filename + '.png' | |
| print ('\nSaving to:', png) | |
| plt.savefig(png) | |
| plt.show() | |
| def ppoints(n): | |
| ''' | |
| Same as R's ppoints | |
| http://stat.ethz.ch/R-manual/R-patched/library/stats/html/ppoints.html | |
| Also check: http://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Heuristics | |
| ''' | |
| if n <= 10: | |
| a = 3.0/8.0 | |
| else: | |
| a = 0.5 | |
| return (np.array(range(1,n+1)) - a) / (n + 1 - 2.0*a) | |
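| # Worked example for the helper above; the values agree with R's ppoints(5): | |
| # | |
| #   ppoints(5)  ->  array([0.11904762, 0.30952381, 0.5, 0.69047619, 0.88095238]) | |
| # | |
| # i.e. (i - 3/8) / (n + 1/4) for i = 1..5, the plotting positions used for the expected | |
| # quantiles in the QQ plot below. | |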
| def plot_qqplot(p_values, output_filename, title='QQ plot'): | |
| ''' | |
| Create a qq-plot for p values from significance testing. | |
| ''' | |
| observed = np.sort(p_values) | |
| observed = -np.log10(observed) | |
| expected = -np.log10(ppoints(len(observed))) | |
| x_label = 'expected -log10(p)' | |
| y_label = 'observed -log10(p)' | |
| l = np.median(observed)/-np.log10(0.5) | |
| xmax = max(expected[np.invert(np.isinf(expected))]) + 1 | |
| ymax = max(observed[np.invert(np.isinf(observed))]) + 1 | |
| xymax = max(xmax, ymax) | |
| fig, ax = plt.subplots() | |
| #Set labels | |
| ax.set_xlabel(x_label) | |
| ax.set_ylabel(y_label) | |
| ax.set_title(title) | |
| #plot ab line | |
| ax.plot([-0.2, xymax], [-0.2, xymax], color='red', linewidth=2.0) | |
| #Plot qq plot | |
| ax.plot(expected, observed, '.') | |
| png = output_filename + '.png' | |
| print ('\nSaving plot to:', png) | |
| plt.savefig(png) | |
| plt.show() | |
| def read_pvalues(filename, test_name): | |
| locations = [] | |
| pvalues = [] | |
| with open(filename) as f: | |
| header = f.readline().replace('\n', '').split() | |
| p_column_name = test_name + '_pvalue' | |
| if not p_column_name in header: | |
| raise Exception('Column {} does not exist in file: {}'.format(p_column_name, filename)) | |
| p_column_index = header.index(p_column_name) | |
| location_index = header.index('location') | |
| bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength) | |
| line_counter = 0 | |
| for line in f: | |
| line_counter += 1 | |
| if line_counter % 10000 == 0: | |
| bar.update(line_counter) | |
| line_splitted = line.replace('\n', '').split() | |
| location = int(line_splitted[location_index]) | |
| p_value = float(line_splitted[p_column_index]) | |
| locations.append(location) | |
| pvalues.append(p_value) | |
| return locations, pvalues | |
| def manhattan(options, output): | |
| manhattan_filename = options[0] | |
| manhattan_test = options[1] | |
| locations, pvalues = read_pvalues(manhattan_filename, manhattan_test) | |
| plot_manhattan(locations, pvalues, output + '.'+ manhattan_test + '.manhattan', | |
| title='{} Manhattan plot'.format(manhattan_test)) | |
| def qqplot(options, output): | |
| qq_filename = options[0] | |
| qq_test = options[1] | |
| locations, pvalues = read_pvalues(qq_filename, qq_test) | |
| plot_qqplot(pvalues, | |
| output_filename = output + '.' + qq_test + '.qqplot', | |
| title = '{} QQ plot'.format(qq_test)) | |
| def parse_vep_response(response): | |
| ret = {} | |
| for r in response: | |
| if 'transcript_consequences' in r: | |
| for transcript_consequence in r['transcript_consequences']: | |
| transcript_id = transcript_consequence['transcript_id'] | |
| ret[transcript_id] = {} | |
| if 'polyphen_prediction' in transcript_consequence: | |
| ret[transcript_id]['polyphen_prediction'] = transcript_consequence['polyphen_prediction'] | |
| if 'sift_prediction' in transcript_consequence: | |
| ret[transcript_id]['sift_prediction'] = transcript_consequence['sift_prediction'] | |
| return ret | |
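| # Hedged sketch of the structure this parser expects: a VEP response is a list of records, | |
| # each possibly carrying a 'transcript_consequences' list; only the transcript id and any | |
| # PolyPhen / SIFT predictions are kept. Made-up example: | |
| # | |
| #   response = [{'transcript_consequences': [ | |
| #       {'transcript_id': 'ENST00000000001', 'polyphen_prediction': 'benign', | |
| #        'sift_prediction': 'tolerated'}]}] | |
| #   parse_vep_response(response) | |
| #   # -> {'ENST00000000001': {'polyphen_prediction': 'benign', 'sift_prediction': 'tolerated'}} | |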
| def hgvs_formatter(chromosome, location, reference, alternative): | |
| return "{}:g.{}{}>{}".format(chromosome, location, reference, alternative) | |
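| # For example (made-up coordinates), hgvs_formatter('20', 1234567, 'A', 'G') returns | |
| # "20:g.1234567A>G", the genomic HGVS notation accepted by the Ensembl VEP hgvs endpoint | |
| # used below. | |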
| def get_info_hgvs(hgvs_l): | |
| ''' | |
| POST request | |
| ''' | |
| url = 'http://rest.ensembl.org/vep/human/hgvs' | |
| headers = { "Content-Type" : "application/json", "Accept" : "application/json"} | |
| data = { "hgvs_notations" : hgvs_l} | |
| data_json = json.dumps(data) | |
| r = requests.post(url, headers=headers, data=data_json) | |
| if not r.ok: | |
| return None | |
| data = r.json() | |
| return parse_vep_response(data) | |
| def get_info_rs(rs_variant_l): | |
| ''' | |
| POST request | |
| ''' | |
| url = 'http://rest.ensembl.org/vep/human/id' | |
| headers = { "Content-Type" : "application/json", "Accept" : "application/json"} | |
| data = { "ids" : rs_variant_l } | |
| data_json = json.dumps(data) | |
| r = requests.post(url, headers=headers, data=data_json) | |
| if not r.ok: | |
| return None | |
| data = r.json() | |
| #print (data) | |
| return parse_vep_response(data) | |
| def print_get_info_results(get_info_results): | |
| ''' | |
| Nicely print get_info results | |
| ''' | |
| print (json.dumps(get_info_results, indent=4)) | |
| def is_rs(variant): | |
| return re.match(r'rs[\d]+', variant) | |
| def get_info(cases, controls, variants_index): | |
| ret = {} | |
| total = len(variants_index) | |
| bar = progressbar.ProgressBar(max_value=len(variants_index)) | |
| for variant_counter, variant_index in enumerate(variants_index): | |
| bar.update(variant_counter) | |
| #Check which variants are rs and which are hgvs | |
| rs_id = cases[variant_index]['r_id'] | |
| #print ('Running get_info for: {} {}/{}'.format(rs_id, variant_counter+1, total)) | |
| if is_rs(rs_id): | |
| get_info_results = get_info_rs([rs_id]) | |
| else: | |
| s = re.match(r'([\d]+)-([\d]+)', rs_id) | |
| if not s: | |
| raise Exception('Could not parse rs_id: {}'.format(rs_id)) | |
| chromosome = s.group(1) | |
| location = int(s.group(2)) | |
| liftovered_location = do_liftover(chromosome, location) | |
| reference = cases[variant_index]['r'] | |
| alternative = cases[variant_index]['a'] | |
| this_hgvs = hgvs_formatter(chromosome, liftovered_location, reference, alternative) | |
| get_info_results = get_info_hgvs([this_hgvs]) | |
| #Check returned data | |
| if get_info_results == {}: | |
| print ('WARNING!!!! get_info did not return consequences information for {}'.format(rs_id)) | |
| elif get_info_results is None: | |
| print ('WARNING!!!! get_info failed for variant: {}'.format(rs_id)) | |
| else: | |
| ret[variant_index] = get_info_results | |
| #print_get_info_results(results) | |
| return ret | |
| def do_liftover(chromosome, position): | |
| if not lo: | |
| return position | |
| lo_result = lo.convert_coordinate('chr{}'.format(chromosome), position) | |
| if not lo_result: | |
| print ('WARNING!! Could not liftover: {}:{}'.format(chromosome, position)) | |
| return None | |
| if len(lo_result) > 1: | |
| # Is this possible??? | |
| print ('WARNING! Liftover returned more than one position for {}:{}'.format(chromosome, position)) | |
| return lo_result[0][1] | |
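| # Note on the pyliftover call above: LiftOver.convert_coordinate() returns a list of | |
| # (chromosome, position, strand, score) tuples, or an empty list / None when the position | |
| # cannot be mapped, which is why the single converted position is read as lo_result[0][1]. | |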
| def setup_liftover(from_assembly, to_assembly): | |
| global lo | |
| print ('Setting up liftover..') | |
| lo = LiftOver(from_assembly, to_assembly) | |
| print ('...Done') | |
| def plot_distribution(data, bins=50, xlabel='', ylabel='', title='', output_filename=None): | |
| ''' | |
| http://matplotlib.org/1.2.1/examples/pylab_examples/histogram_demo.html | |
| ''' | |
| n, bins, patches = plt.hist(data, bins) | |
| plt.xlabel(xlabel) | |
| plt.ylabel(ylabel) | |
| plt.title(title) | |
| if output_filename: | |
| png = output_filename + '.png' | |
| print ('Saving to ', png) | |
| plt.savefig(png) | |
| plt.show() | |
| def maf_from_frequency_file(filename): | |
| frequencies = [] | |
| print ('Reading file:', filename) | |
| with open(filename) as f: | |
| for line in f: | |
| line_splitted = line.replace('\n', '').split() | |
| fr_ref = float(line_splitted[-2]) | |
| fr_alt = float(line_splitted[-1]) | |
| maf = min(fr_ref, fr_alt) | |
| frequencies.append(maf) | |
| return frequencies | |
| def p_from_hwe_file(filename): | |
| print ('Reading file:', filename) | |
| all_p = [] | |
| with open(filename) as f: | |
| for line in f: | |
| line_splitted = line.replace('\n', '').split() | |
| p = float(line_splitted[-1]) | |
| all_p.append(p) | |
| return all_p | |
| def allele_frequency_plot(filename): | |
| ''' | |
| Plots the distribution of minor allele frequencies read from an allele frequency file | |
| ''' | |
| frequencies = maf_from_frequency_file(filename) | |
| plot_distribution(frequencies, | |
| bins=100, | |
| xlabel='Minor Allele Frequencies', | |
| ylabel='Counts', | |
| output_filename = filename) | |
| def hwe_plot(filename): | |
| all_p = p_from_hwe_file(filename) | |
| plot_distribution(all_p, | |
| bins=100, | |
| xlabel='HWE p-values', | |
| ylabel='Counts', | |
| output_filename=filename) | |
| def maf_filter(cases, controls, output, maf_filter): | |
| print ('\nFiltering with MAF>{}'.format(maf_filter)) | |
| print ('Calculate frequencies..') | |
| frequencies_filename = allele_frequency(cases, controls, output) | |
| print ('Read file with frequencies..') | |
| frequencies = maf_from_frequency_file(frequencies_filename) | |
| print ('Apply filter..') | |
| removed = 0 | |
| total = 0 | |
| with open('remove_snps.tmp', 'w') as f: | |
| for case, frequency in zip(cases, frequencies): | |
| total += 1 | |
| if frequency<maf_filter: | |
| removed += 1 | |
| f.write(case['s_id'] + '\n') | |
| print ('Removed: {} out from {} SNPs'.format(removed, total)) | |
| print ('Create new data..') | |
| remove_snps(cases, controls, output, 'remove_snps.tmp') | |
| def hwe_filter(cases, controls, output, hwe_filter, test): | |
| print ('\nFiltering with HWE>{}'.format(hwe_filter)) | |
| print ('Calculate hwe..') | |
| hwe_filename = hwe(cases, controls, output, test) | |
| print ('\nReading file with p-values: {}'.format(hwe_filename)) | |
| all_p = p_from_hwe_file(hwe_filename) | |
| print ('Apply filter..') | |
| removed = 0 | |
| total = 0 | |
| with open('remove_snps.tmp', 'w') as f: | |
| for case, p in zip(cases, all_p): | |
| total += 1 | |
| if p < hwe_filter: | |
| removed += 1 | |
| f.write(case['s_id'] + '\n') | |
| print ('Removed: {} out from {} SNPs'.format(removed, total)) | |
| print ('Create new data..') | |
| remove_snps(cases, controls, output, 'remove_snps.tmp') | |
| def p_value_generator(filename, test_name): | |
| ''' | |
| This is a generator: https://wiki.python.org/moin/Generators | |
| ''' | |
| print ('\nOpening file:', filename) | |
| with open(filename) as f: | |
| header = f.readline().replace('\n', '').split() | |
| p_column_name = test_name + '_pvalue' | |
| if not p_column_name in header: | |
| raise Exception('{} column does not exist in {}'.format(p_column_name, filename)) | |
| p_column_index = header.index(p_column_name) | |
| line_counter = 0 | |
| for line in f: | |
| line_counter += 1 | |
| if line_counter % 10000 == 0: | |
| pass | |
| #print ('Line counter: ', line_counter) | |
| line_splitted = line.replace('\n', '').split() | |
| p_value = float(line_splitted[p_column_index]) | |
| #yield line_counter, p_value | |
| yield p_value | |
| def find_peaks(data, window_size, significance_level): | |
| peaks = [] | |
| for i in range(0, len(data), window_size): | |
| window = -np.log(data[i : i+window_size]) | |
| window_index = i + int(np.argmax(window)) # The int() part is to get over with this issue: https://bugs.python.org/issue24313 | |
| # Yep.. even python has its quirks.. | |
| window_max = np.max(window) | |
| if window_max < significance_level: | |
| continue # This is not a peak | |
| #There is an interesting peak. # Keep it! | |
| peaks.append({'index': window_index, 'max': window_max}) | |
| return peaks | |
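| # Note on the peak detection above: p-values are transformed with the natural logarithm | |
| # (np.log, not log10), so the significance_level of 8 used by independent() below | |
| # corresponds to p < exp(-8), roughly 3.4e-4. Hedged, made-up example: | |
| # | |
| #   find_peaks([0.5, 1e-5, 0.3, 0.9], window_size=2, significance_level=8) | |
| #   # -> [{'index': 1, 'max': 11.5129...}]   (only the first window clears the threshold) | |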
| def get_data_point(cases, controls, index): | |
| return cases[index]['g'] + controls[index]['g'] | |
| def disequilibrium_criterion(ld_result, dprime_threshold=0.98, r_sq_threshold=0.2): | |
| ''' | |
| http://www.nature.com/nrg/journal/v4/n8/full/nrg1123.html | |
| ''' | |
| if np.abs(ld_result['Dprime']) > dprime_threshold: | |
| return True | |
| if ld_result['R_sq'] > r_sq_threshold: | |
| return True | |
| return False | |
| def get_independent_peaks(cases, controls, peaks): | |
| iterations = 0 | |
| while True: | |
| iterations += 1 | |
| new_peaks = [] | |
| change_happened = False | |
| for peak_index in range(0, len(peaks), 2): | |
| peak_1 = peaks[peak_index] | |
| if peak_index+1 == len(peaks): | |
| new_peaks.append(peak_1) # odd number of peaks: keep the unpaired last one | |
| continue | |
| peak_2 = peaks[peak_index+1] | |
| distance = peak_2['index'] - peak_1['index'] | |
| genotypes_1 = get_data_point(cases, controls, peak_1['index']) | |
| genotypes_2 = get_data_point(cases, controls, peak_2['index']) | |
| ld_result = Pairwise_linkage_disequilibrium(genotypes_1, genotypes_2) | |
| are_in_LD = disequilibrium_criterion(ld_result) | |
| if are_in_LD: | |
| new_peaks.append(peak_1) | |
| change_happened = True | |
| else: | |
| new_peaks.append(peak_1) | |
| new_peaks.append(peak_2) | |
| print ('Iteration: {}. Peaks: {}'.format(iterations, len(new_peaks))) | |
| peaks = new_peaks | |
| if not change_happened: | |
| # Our work here is done | |
| break | |
| return peaks | |
| def regional_ld_snps(cases, controls, index): | |
| ret = [] | |
| peak_genotypes = get_data_point(cases, controls, index) | |
| for direction in [-1, +1]: | |
| #print ('Direction: {}'.format({-1:'-', 1:'+'}[direction])) | |
| compare_with = index + direction | |
| last_snp_in_ld = index | |
| while True: | |
| compare_with_genotype = get_data_point(cases, controls, compare_with) | |
| ld_result = Pairwise_linkage_disequilibrium(peak_genotypes, compare_with_genotype) | |
| if disequilibrium_criterion(ld_result): | |
| #print ('{} - {} LD!'.format(peak_counter, compare_with)) | |
| ret.append(compare_with) | |
| last_snp_in_ld = compare_with | |
| else: | |
| #What is the distance between this and the last distance in LD? | |
| distance = np.abs(compare_with - last_snp_in_ld) | |
| #print ('{} - {} NOT LD. DISTANCE: {}'.format(peak_counter, compare_with, distance)) | |
| if distance > 100: # 1000 returned the same results. | |
| break | |
| compare_with += direction # move along | |
| return ret | |
| def independent(cases, controls, output, independent_arg): | |
| ''' | |
| This function takes as input the result of a statistical test analysis | |
| and locates the independent signals | |
| ''' | |
| filename = independent_arg[0] | |
| test_name = independent_arg[1] | |
| p_values = list(p_value_generator(filename, test_name)) | |
| peaks = find_peaks(p_values, 100, 8) | |
| print ('Found {} potential interesting significance peaks!'.format(len(peaks))) | |
| ind_peaks = get_independent_peaks(cases, controls, peaks) | |
| print ('Independent peaks: {}'.format(len(ind_peaks))) | |
| peak_lds = {} | |
| for peak_counter, peak in enumerate(ind_peaks): | |
| print ('Investigating peak: {}'.format(peak_counter+1)) | |
| peak_index = peak['index'] | |
| peak_lds[peak_index] = regional_ld_snps(cases, controls, peak_index) | |
| print ('Peak {}. Found {} SNPs in LD'.format(peak_counter, len(peak_lds[peak_index]))) | |
| output_filename = output + '.independent' | |
| print ('Saving results to {}'.format(output_filename)) | |
| with open(output_filename, 'w') as f: | |
| json.dump(peak_lds, f, indent=4) | |
| def get_info_independent(cases, controls, output, input_filename): | |
| print ('Reading:', input_filename) | |
| to_save = {} | |
| with open(input_filename) as f: | |
| peak_lds = json.load(f) | |
| peak_lds = {int(k):v for k,v in peak_lds.items()} # JSON stores keys as strings.. | |
| for peak_counter, peak in enumerate(peak_lds): | |
| print ('\nGetting info for peak: {}/{}'.format(peak_counter+1, len(peak_lds))) | |
| peak_id = cases[peak]['r_id'] | |
| peak_location = cases[peak]['l'] | |
| peak_info = get_info(cases, controls, peak_lds[peak]) | |
| to_save[peak_id] = { | |
| 'location': peak_location, | |
| 'info': peak_info, | |
| } | |
| output_filename = output + '.get_info' | |
| print ('\nSaving results to:', output_filename) | |
| with open(output_filename, 'w') as f: | |
| json.dump(to_save, f, indent=4) | |
| if __name__ == '__main__': | |
| ''' | |
| Example runs: | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --keep_snps example.keep.snps | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --remove_snps example.keep.snp | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --keep_samples example.keep.samples | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --remove_samples example.keep.samples | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --allele_frequency | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --allele_frequency_plot output.frequency | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_maf005 --maf_filter 0.05 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --hwe_asymptotic | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --hwe_exact | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --hwe_plot output_asymptotic.hwe_asymptotic | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_hwe_asymptotic_0001 --hwe_asymptotic_filter 0.05 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_hwe_exact_0001 --hwe_exact_filter 0.05 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --ld snp_5 snp_7 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --association_test | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --manhattan output.association GENO | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --qqplot output.association GENO | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --liftover hg18 hg38 --get_info rs4814683 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --liftover hg18 hg38 --get_info snp_1 | |
| ''' | |
| argparser = argparse.ArgumentParser(description='Python project https://gist.github.com/kantale/24f4f02d5b87ce0146e0455791e71aeb ') | |
| argparser.add_argument('--controls_file', default=None, required = False) | |
| argparser.add_argument('--cases_file', default=None, required=False) | |
| argparser.add_argument('--output') | |
| argparser.add_argument('--keep_snps', default=None, required = False) | |
| argparser.add_argument('--remove_snps', default=None, required = False) | |
| argparser.add_argument('--keep_samples', default=None, required=False) | |
| argparser.add_argument('--remove_samples', default=None, required=False) | |
| argparser.add_argument('--allele_frequency', action='store_true', required=False) | |
| argparser.add_argument('--allele_frequency_plot', required=False) | |
| argparser.add_argument('--maf_filter', default=None, type=float, required=False) | |
| argparser.add_argument('--hwe_asymptotic', action='store_true', required=False) | |
| argparser.add_argument('--hwe_exact', action='store_true', required=False) | |
| argparser.add_argument('--hwe_plot', default=None, required=False) | |
| argparser.add_argument('--hwe_asymptotic_filter', default=None, type=float, required=False) | |
| argparser.add_argument('--hwe_exact_filter', default=None, type=float, required=False) | |
| argparser.add_argument('--ld', nargs=2, required=False) | |
| argparser.add_argument('--association_test', nargs='*', required=False) | |
| argparser.add_argument('--manhattan', nargs=2, required=False) | |
| argparser.add_argument('--qqplot', nargs=2, required=False) | |
| argparser.add_argument('--get_info', default=None, required=False) | |
| argparser.add_argument('--liftover', nargs=2, required=False) | |
| argparser.add_argument('--independent', nargs=2, default=None, required=False) | |
| argparser.add_argument('--get_info_independent', default=None, required=False) | |
| options = argparser.parse_args() | |
| cases = None | |
| controls = None | |
| output = options.output | |
| if not options.controls_file is None and not options.cases_file is None: | |
| cases, controls = read_input(options) | |
| #Setup liftover object | |
| if options.liftover: | |
| print ('\nSetting up liftover object from:{} to:{}'.format(*options.liftover)) | |
| setup_liftover(options.liftover[0], options.liftover[1]) | |
| if not options.keep_snps is None: | |
| keep_snps(cases, controls, output, options.keep_snps) | |
| if not options.remove_snps is None: | |
| remove_snps(cases, controls, output, options.remove_snps) | |
| if not options.keep_samples is None: | |
| keep_samples(cases, controls, output, options.keep_samples) | |
| if not options.remove_samples is None: | |
| remove_samples(cases, controls, output, options.remove_samples) | |
| if options.allele_frequency: | |
| allele_frequency(cases, controls, output) | |
| if options.allele_frequency_plot: | |
| allele_frequency_plot(options.allele_frequency_plot) | |
| if not options.maf_filter is None: | |
| maf_filter(cases, controls, output, options.maf_filter) | |
| if options.hwe_asymptotic: | |
| hwe(cases, controls, output, 'asymptotic') | |
| if options.hwe_exact: | |
| hwe(cases, controls, output, 'exact') | |
| if options.hwe_plot: | |
| hwe_plot(options.hwe_plot) | |
| if not options.hwe_asymptotic_filter is None: | |
| hwe_filter(cases, controls, output, options.hwe_asymptotic_filter, 'asymptotic') | |
| if not options.hwe_exact_filter is None: | |
| hwe_filter(cases, controls, output, options.hwe_exact_filter, 'exact') | |
| if options.ld: | |
| ld(cases, controls, options.ld[0], options.ld[1], output) | |
| if not options.association_test is None: | |
| association_test(cases, controls, output, options.association_test) | |
| if not options.manhattan is None: | |
| manhattan(options.manhattan, output) | |
| if not options.qqplot is None: | |
| qqplot(options.qqplot, output) | |
| if not options.get_info is None: | |
| get_info(cases, controls, [options.get_info]) | |
| if not options.independent is None: | |
| independent(cases, controls, output, options.independent) | |
| if not options.get_info_independent is None: | |
| get_info_independent(cases, controls, output, options.get_info_independent) | |