Skip to content

Instantly share code, notes, and snippets.

@kantale
Last active November 13, 2017 13:10
Show Gist options
  • Select an option

  • Save kantale/87b4e1b24abf9be85e44fcd7f8324639 to your computer and use it in GitHub Desktop.

Select an option

Save kantale/87b4e1b24abf9be85e44fcd7f8324639 to your computer and use it in GitHub Desktop.
'''
Copyright (c) 2017, Alexandros Kanterakis kantale@ics.forth.gr
All rights reserved.
Simplified BSD License:
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
import re
import os
import json
import time
import numpy as np
import argparse
import requests
import progressbar #
from pyliftover import LiftOver
import matplotlib.pyplot as plt
from scipy import stats
# Global pyliftover.LiftOver instance, initialised by setup_liftover().
# While it is None, do_liftover() returns positions unchanged.
lo = None
def print_now():
    '''
    Return (not print) the current UTC time as a formatted string,
    e.g. "Mon, 13 Nov 2017 13:10:00".
    Useful for progress information.
    '''
    return time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime())
def load(filename):
    '''
    Parse a genotype file into a list of variant dictionaries.

    Each line holds: snp_id, rs_id, location, reference, alternative,
    followed by the genotypes encoded as triples of 0/1 flags
    (1 0 0 = hom ref, 0 1 0 = het, 0 0 1 = hom alt).
    '''
    print ('Opening file:', filename)
    variants = []
    with open(filename) as f:
        bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
        for line_number, line in enumerate(f, start=1):
            if line_number % 1000 == 0:
                bar.update(line_number)
            fields = line.replace('\n', '').split()
            snp_id, rs_id = fields[0], fields[1]
            location = int(fields[2])
            reference, alternative = fields[3], fields[4]
            # Translate each (x, y, z) flag triple into a pair of alleles.
            decode = {
                ('1', '0', '0'): (reference, reference),
                ('0', '1', '0'): (reference, alternative),
                ('0', '0', '1'): (alternative, alternative),
            }
            flags = fields[5:]
            genotypes = [
                decode[tuple(flags[i:i + 3])]
                for i in range(0, len(flags), 3)
            ]
            variants.append({
                's_id': snp_id,
                'r_id': rs_id,
                'l': location,
                'r': reference,
                'a': alternative,
                'g': genotypes,
            })
    return variants
def save(data, filename):
    '''
    Write a list of variant dictionaries back to disk, in the same
    space-separated format that load() reads.
    '''
    print ('\nSaving data to:', filename)
    with open(filename, 'w') as f:
        bar = progressbar.ProgressBar(max_value=len(data)+1)
        for line_number, variant in enumerate(data, start=1):
            if line_number % 1000 == 0:
                bar.update(line_number)
            fields = [
                variant['s_id'], variant['r_id'], str(variant['l']),
                variant['r'], variant['a'],
            ]
            # Encode each allele pair back into its 0/1 flag triple.
            encode = {
                (variant['r'], variant['r']): ['1', '0', '0'],
                (variant['r'], variant['a']): ['0', '1', '0'],
                (variant['a'], variant['a']): ['0', '0', '1'],
            }
            for genotype in variant['g']:
                fields.extend(encode[genotype])
            f.write(' '.join(fields) + '\n')
def read_input(options):
    """Load the cases and controls genotype files; return them as a pair."""
    datasets = []
    for label, filename in (('CASES', options.cases_file), ('CONTROLS', options.controls_file)):
        print ('\nLOADING {}:'.format(label))
        datasets.append(load(filename))
    return tuple(datasets)
def keep_snps_data(data, keep_snps, output_filename):
    """Save only the variants whose SNP id appears in keep_snps."""
    kept = list(filter(lambda variant: variant['s_id'] in keep_snps, data))
    save(kept, output_filename)
def remove_snps_data(data, remove_snps, output_filename):
    """Save only the variants whose SNP id does NOT appear in remove_snps."""
    kept = list(filter(lambda variant: variant['s_id'] not in remove_snps, data))
    save(kept, output_filename)
def read_snp_list(filename):
    '''
    Read one SNP id per line and return them as dictionary keys
    (all mapping to None) so that membership tests are O(1).
    '''
    with open(filename) as f:
        snps = {}
        for line in f:
            snps[line.replace('\n', '')] = None
    print ('Read {} SNPs from {}'.format(len(snps), filename))
    return snps
def keep_snps(cases, controls, output, keep_snps_filename):
    """Filter cases and controls down to the SNP ids listed in the given file."""
    wanted = read_snp_list(keep_snps_filename)
    for data, suffix in ((cases, '.keep_snps.cases'), (controls, '.keep_snps.controls')):
        keep_snps_data(data, wanted, output + suffix)
def remove_snps(cases, controls, output, remove_snps_filename):
    """Drop from cases and controls the SNP ids listed in the given file."""
    unwanted = read_snp_list(remove_snps_filename)
    for data, suffix in ((cases, '.remove_snps.cases'), (controls, '.remove_snps.controls')):
        remove_snps_data(data, unwanted, output + suffix)
def read_sample_list(filename):
    '''
    Read sample ids (one per line, formatted "case_<N>" or "control_<N>")
    and return {'cases': [...], 'controls': [...]} holding zero-based
    sample indices, with duplicates removed.

    Raises Exception for a malformed line and ValueError when the numeric
    part is not an integer.
    '''
    sample_ids = {
        'cases': [],
        'controls': [],
    }
    with open(filename) as f:
        for line in f:
            sample = line.replace('\n', '')
            sample_splitted = sample.split('_')
            if len(sample_splitted) != 2:
                raise Exception('Unknown sample format: {}'.format(sample))
            cases_controls = sample_splitted[0]
            if not cases_controls in ['case', 'control']:
                raise Exception('First part of sample id should be case or control')
            cases_controls += 's'  # control --> controls, case --> cases
            try:
                case_id = int(sample_splitted[1])
            except ValueError as e:
                # int() raises ValueError (the original caught NameError,
                # which int() never raises, so this message never appeared).
                print ('Second part of sample id should be an integer')
                raise e
            # Ids are 1-based in the file; store 0-based indices.
            sample_ids[cases_controls].append(case_id - 1)
    # Remove possible duplicates
    for case_control in sample_ids:
        sample_ids[case_control] = list(set(sample_ids[case_control]))
    print ('Read {} case samples and {} control samples from {}'.format(len(sample_ids['cases']), len(sample_ids['controls']), filename))
    return sample_ids
def keep_sample_data(data, samples_to_keep, output_filename):
    """Restrict every variant's genotype list to the given sample indices, then save."""
    for variant in data:
        filtered = []
        for index, genotype in enumerate(variant['g']):
            if index in samples_to_keep:
                filtered.append(genotype)
        variant['g'] = filtered
    save(data, output_filename)
def keep_samples(cases, controls, output, keep_samples_filename):
    """Keep only the samples listed in the given file (cases and controls separately)."""
    wanted = read_sample_list(keep_samples_filename)
    keep_sample_data(cases, wanted['cases'], output + '.keep_samples.cases')
    keep_sample_data(controls, wanted['controls'], output + '.keep_samples.controls')
def remove_sample_data(data, samples_to_remove, output_filename):
    """Drop the given sample indices from every variant's genotype list, then save."""
    for variant in data:
        variant['g'] = [
            genotype
            for index, genotype in enumerate(variant['g'])
            if index not in samples_to_remove
        ]
    save(data, output_filename)
def remove_samples(cases, controls, output, remove_samples_filename):
    """Remove the samples listed in the given file (cases and controls separately)."""
    unwanted = read_sample_list(remove_samples_filename)
    remove_sample_data(cases, unwanted['cases'], output + '.remove_samples.cases')
    remove_sample_data(controls, unwanted['controls'], output + '.remove_samples.controls')
def allele_frequency(cases, controls, output):
    '''
    Write per-SNP allele frequencies to <output>.frequency and return that
    filename.

    Each output line holds: snp_id, then the reference and alternative
    allele frequencies in controls, the same two in cases, and finally
    the reference allele frequency over all samples.
    '''
    output_filename = output + '.frequency'
    print ('\nSaving frequencies to:', output_filename)
    with open(output_filename, 'w') as f:
        line_counter = 0
        bar = progressbar.ProgressBar(max_value=len(cases)+1)
        for case, control in zip(cases, controls):
            line_counter += 1
            if line_counter % 1000 == 0:
                bar.update(line_counter)
            to_print = [case['s_id']]
            reference = case['r']
            alternative = case['a']
            all_cases = len(case['g'])
            all_controls = len(control['g'])
            # Allele counts in controls
            ref_controls = sum(genotype.count(reference) for genotype in control['g'])
            alt_controls = sum(genotype.count(alternative) for genotype in control['g'])
            # Allele counts in cases
            ref_cases = sum(genotype.count(reference) for genotype in case['g'])
            alt_cases = sum(genotype.count(alternative) for genotype in case['g'])
            # Reference allele count over all samples.
            # (The original also computed an alternative-allele total that
            # was never written anywhere; that dead code was removed.)
            ref_all = ref_controls + ref_cases
            # Each diploid sample carries two alleles, hence the 2.0 factors.
            to_print.extend([
                ref_controls / (2.0 * all_controls),
                alt_controls / (2.0 * all_controls),
                ref_cases / (2.0 * all_cases),
                alt_cases / (2.0 * all_cases),
                ref_all / (2.0 * (all_controls + all_cases)),
            ])
            f.write(' '.join(map(str, to_print)) + '\n')
    return output_filename
def Hardy_Weinberg_Equilibrium_asymptotic_test(obs_hets, obs_hom1, obs_hom2):
    '''
    Compute the Hardy-Weinberg Equilibrium of a SNP by using an asymptotic
    chi-square test (1 degree of freedom).
    This is equivalent with the value produced with the --hardy2 option of plink
    software (http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#hardy).

    obs_hets / obs_hom1 / obs_hom2: observed genotype counts.
    Returns 0.0 when one allele is entirely absent (monomorphic SNP).
    '''
    xx = obs_hom1
    xy = obs_hets
    yy = obs_hom2
    # Monomorphic SNP: only one kind of homozygote and no heterozygotes.
    if xy == 0 and yy == 0:
        return 0.0
    if xx == 0 and xy == 0:
        return 0.0
    samples = xx + xy + yy
    alleleX_count = (2 * xx) + xy
    alleleX_ratio = float(alleleX_count) / (2 * samples)
    alleleY_count = xy + (2 * yy)
    alleleY_ratio = float(alleleY_count) / (2 * samples)
    # Expected genotype counts under HWE (p^2, 2pq, q^2 scaled by samples).
    expected_xx_n = alleleX_ratio * alleleX_ratio * samples
    expected_xy_n = 2.0 * alleleX_ratio * alleleY_ratio * samples
    expected_yy_n = alleleY_ratio * alleleY_ratio * samples
    chi_square_total = (
        ((xx - expected_xx_n) ** 2.0) / expected_xx_n
        + ((xy - expected_xy_n) ** 2.0) / expected_xy_n
        + ((yy - expected_yy_n) ** 2.0) / expected_yy_n
    )
    # stats.chisqprob was deprecated and removed from SciPy;
    # chi2.sf is the equivalent survival function.
    return stats.chi2.sf(chi_square_total, 1)
def Hardy_Weinberg_Equilibrium_exact_test(obs_hets, obs_hom1, obs_hom2):
    '''
    Exact SNP test of Hardy-Weinberg Equilibrium.

    According to the comments:
    This code implements an exact SNP test of Hardy-Weinberg Equilibrium as described in Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg Equilibrium.
    American Journal of Human Genetics. 76: 000 - 000
    Written by Jan Wigginton
    Converted to python from the C/C++ code from: http://www.sph.umich.edu/csg/abecasis/Exact/c_instruct.html from Alexandros Kanterakis (kantale@ics.forth.gr)
    This test is used by plink when applying the --hardy option (http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#hardy) and from Beagle utilities (http://faculty.washington.edu/browning/beagle_utilities/utilities.html) to compute Hardy-Weinberg Equilibrium.

    obs_hets / obs_hom1 / obs_hom2: observed genotype counts.
    Returns the exact HWE p-value (p_hwe).
    '''
    if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0:
        raise Exception("FATAL ERROR - SNP-HWE: Current genotype configuration (%s %s %s) includes negative count" % (obs_hets, obs_hom1, obs_hom2))
    # The rarer homozygote class defines the minor (rare) allele.
    obs_homc = obs_hom2 if obs_hom1 < obs_hom2 else obs_hom1
    obs_homr = obs_hom1 if obs_hom1 < obs_hom2 else obs_hom2
    rare_copies = 2 * obs_homr + obs_hets
    genotypes = obs_hets + obs_homc + obs_homr
    # het_probs[i]: (unnormalised) probability of observing i heterozygotes.
    het_probs = [0.0] * (rare_copies + 1)
    #start at midpoint
    mid = rare_copies * (2 * genotypes - rare_copies) // (2 * genotypes)
    #check to ensure that midpoint and rare alleles have same parity
    if (rare_copies & 1) ^ (mid & 1):
        mid += 1
    curr_hets = mid
    curr_homr = (rare_copies - mid) // 2
    curr_homc = genotypes - curr_hets - curr_homr
    het_probs[mid] = 1.0
    sum = float(het_probs[mid])  # NOTE(review): shadows the builtin sum()
    # Fill the distribution downwards from the midpoint using the recurrence
    # relation between adjacent heterozygote counts.
    for curr_hets in range(mid, 1, -2):
        het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0))
        sum += het_probs[curr_hets - 2];
        # 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote
        curr_homr += 1
        curr_homc += 1
    # Reset and fill the distribution upwards from the midpoint.
    curr_hets = mid
    curr_homr = (rare_copies - mid) // 2
    curr_homc = genotypes - curr_hets - curr_homr
    for curr_hets in range(mid, rare_copies - 1, 2):
        het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc / ((curr_hets + 2.0) * (curr_hets + 1.0))
        sum += het_probs[curr_hets + 2]
        #add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote
        curr_homr -= 1
        curr_homc -= 1
    # Normalise so the probabilities sum to 1.
    for i in range(0, rare_copies + 1):
        het_probs[i] /= sum
    #alternate p-value calculation for p_hi/p_lo
    p_hi = float(het_probs[obs_hets])
    for i in range(obs_hets, rare_copies+1):
        p_hi += het_probs[i]
    p_lo = float(het_probs[obs_hets])
    for i in range(obs_hets-1, -1, -1):
        p_lo += het_probs[i]
    # NOTE(review): p_hi_lo is computed but never used (same as in the
    # original C code); p_hi/p_lo also double-count het_probs[obs_hets],
    # which is harmless since they only feed the unused p_hi_lo.
    p_hi_lo = 2.0 * p_hi if p_hi < p_lo else 2.0 * p_lo
    p_hwe = 0.0
    # p-value calculation for p_hwe: sum of probabilities of all
    # configurations at most as likely as the observed one.
    for i in range(0, rare_copies + 1):
        if het_probs[i] > het_probs[obs_hets]:
            continue;
        p_hwe += het_probs[i]
    p_hwe = 1.0 if p_hwe > 1.0 else p_hwe
    return p_hwe
def hwe(cases, controls, output, test_name):
    '''
    Compute a HWE p-value for every SNP (cases + controls pooled) and write
    "<snp_id> <p>" lines to <output>.hwe_<test_name>.
    test_name selects 'asymptotic' or 'exact'.  Returns the output filename.
    '''
    output_filename = output + '.hwe_' + test_name
    print ('\nSaving HWE to:', output_filename)
    test_function = {
        'asymptotic': Hardy_Weinberg_Equilibrium_asymptotic_test,
        'exact': Hardy_Weinberg_Equilibrium_exact_test,
    }[test_name]
    with open(output_filename, 'w') as f:
        bar = progressbar.ProgressBar(max_value=len(cases)+1)
        for line_counter, (case, control) in enumerate(zip(cases, controls), start=1):
            if line_counter % 1000 == 0:
                bar.update(line_counter)
            reference, alternative = case['r'], case['a']
            pooled = case['g'] + control['g']
            obs_hets = pooled.count((reference, alternative))
            obs_hom1 = pooled.count((reference, reference))
            obs_hom2 = pooled.count((alternative, alternative))
            # Every genotype must be one of the three classes.
            assert obs_hets + obs_hom1 + obs_hom2 == len(pooled)
            p_value = test_function(obs_hets, obs_hom1, obs_hom2)
            f.write(' '.join(map(str, [case['s_id'], p_value])) + '\n')
    return output_filename
def SNP_alleles(
    SNP = None,
    missing = "0",
):
    '''
    Get the two different alleles of a SNP. The parameter SNP is a list of tuples with genotypes.
    For example if SNP = [(A,A), (A,G), (G,G), (G,A)] the function returns (A,G)
    When only one allele occurs the pair (allele, allele) is returned;
    when every observation equals the missing marker, (None, None).
    '''
    first_seen = None
    for genotype in SNP:
        for allele in genotype:
            if allele == missing:
                continue
            if not first_seen:
                first_seen = allele
            elif allele != first_seen:
                # Second distinct allele found: done.
                return (first_seen, allele)
    return (first_seen, first_seen)
def Pairwise_linkage_disequilibrium(
    SNP_1 = None,
    SNP_2 = None,
):
    '''
    Computes the linkage disequilibrium between two SNPs. Computes the r-squared, D' and the estimated haplotype frequencies.
    It has been developed to produce the same values as plink with --ld option. (http://pngu.mgh.harvard.edu/~purcell/plink/ld.shtml)
    Returns a dictionary with the following keys:
    "haplotypes" : A tuple with three values: (1) the estimated haplotype, (2) the haplotype frequencies and (3) frequencies expectation under linkage equilibrium.
    "R_sq" : The r-square.
    "Dprime" : The D prime.

    SNP_1 / SNP_2: parallel lists of genotype tuples, one tuple per sample.
    '''
    def Minor_allele_frequency(
        allele1=None,
        allele2=None,
        observations_chr1 = None,
        observations_chr2 = None
    ):
        """
        Calculates the Minor Allele Frequency of a SNP.
        Returns (frequency, minor allele, total observation count).
        >>> Minor_allele_frequency("A", "C", ["A", "A", "C", "C", "C"], ["A", "C", "C", "C","C"])
        (0.3, 'A', 10)
        """
        allele1_chr1 = observations_chr1.count(allele1)
        allele2_chr1 = observations_chr1.count(allele2)
        if observations_chr2:
            allele1_chr2 = observations_chr2.count(allele1)
            allele2_chr2 = observations_chr2.count(allele2)
        else:
            allele1_chr2 = 0
            allele2_chr2 = 0
        observations = allele1_chr1 + allele2_chr1 + allele1_chr2 + allele2_chr2
        allele1_maf = float((allele1_chr1 + allele1_chr2)) / observations
        if allele1_maf < 0.5:
            return (allele1_maf, allele1, observations)
        else:
            return (1.0 - allele1_maf, allele2, observations)
    ret = {}
    # The two observed alleles of each SNP.
    (SNP_1_allele_1, SNP_1_allele_2) = SNP_alleles(SNP_1)
    (SNP_2_allele_1, SNP_2_allele_2) = SNP_alleles(SNP_2)
    # Split the genotypes into the two chromosome copies.
    Haplotype_1_1 = [x[0] for x in SNP_1]
    Haplotype_1_2 = [x[1] for x in SNP_1]
    Haplotype_2_1 = [x[0] for x in SNP_2]
    Haplotype_2_2 = [x[1] for x in SNP_2]
    MAF_1 = Minor_allele_frequency(SNP_1_allele_1, SNP_1_allele_2, Haplotype_1_1, Haplotype_1_2)
    MAF_2 = Minor_allele_frequency(SNP_2_allele_1, SNP_2_allele_2, Haplotype_2_1, Haplotype_2_2)
    # Per-SNP allele frequencies; freq_X_1 is the frequency of allele 1
    # of SNP X (MAF[1] is always one of the two alleles, so one branch
    # of each if/elif pair is taken).
    if MAF_1[1] == SNP_1_allele_1:
        freq_1_1 = MAF_1[0]
        freq_1_2 = 1.0 - freq_1_1
    elif MAF_1[1] == SNP_1_allele_2:
        freq_1_2 = MAF_1[0]
        freq_1_1 = 1.0 - freq_1_2
    if MAF_2[1] == SNP_2_allele_1:
        freq_2_1 = MAF_2[0]
        freq_2_2 = 1.0 - freq_2_1
    elif MAF_2[1] == SNP_2_allele_2:
        freq_2_2 = MAF_2[0]
        freq_2_1 = 1.0 - freq_2_2
    # Haplotype frequencies expected under linkage equilibrium.
    freq_A_B = freq_1_1 * freq_2_1
    freq_A_b = freq_1_1 * freq_2_2
    freq_a_B = freq_1_2 * freq_2_1
    freq_a_b = freq_1_2 * freq_2_2
    # Nij: count of samples in genotype class i at SNP 1 and class j at
    # SNP 2 (1 = hom allele-1, 2 = het, 3 = hom allele-2).
    Haps = list(zip(Haplotype_1_1, Haplotype_1_2, Haplotype_2_1, Haplotype_2_2))
    N11 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1])
    N21 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1])
    N31 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1])
    N12 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))])
    N22 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))])
    N32 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))])
    N13 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2])
    N23 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2])
    N33 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2])
    # Marginal totals of the 3x3 genotype-pair table.
    N1_ = N11 + N12 + N13
    N2_ = N21 + N22 + N23
    N3_ = N31 + N32 + N33
    N_1 = N11 + N21 + N31
    N_2 = N12 + N22 + N32
    N_3 = N13 + N23 + N33
    N = N1_ + N2_ + N3_
    # EM estimation of haplotype frequencies, starting from uniform 0.25.
    # The double-heterozygote class N22 is phase-ambiguous, so each
    # iteration splits it between the two possible phases in proportion
    # to the current frequency estimates.
    pAB = 0.25
    pab = 0.25
    pAb = 0.25
    paB = 0.25
    for nn in range(10): # 10 should be enough
        nAB = (2*N11) + N12 + N21 + (( (pAB*pab) / ((pAb*paB) + (pAB*pab)) ) * N22)
        nab = (2*N33) + N23 + N32 + (( (pAB*pab) / ((pAb*paB) + (pAB*pab)) ) * N22)
        nAb = (2*N13) + N12 + N23 + (( (pAb*paB) / ((pAb*paB) + (pAB*pab)) ) * N22)
        naB = (2*N31) + N21 + N32 + (( (pAb*paB) / ((pAb*paB) + (pAB*pab)) ) * N22)
        pAB = nAB / (2.0 * N)
        pab = nab / (2.0 * N)
        pAb = nAb / (2.0 * N)
        paB = naB / (2.0 * N)
    ret["haplotypes"] = [
        (SNP_1_allele_1 + SNP_2_allele_1, pAB, freq_A_B),
        (SNP_1_allele_1 + SNP_2_allele_2, pAb, freq_A_b),
        (SNP_1_allele_2 + SNP_2_allele_1, paB, freq_a_B),
        (SNP_1_allele_2 + SNP_2_allele_2, pab, freq_a_b),
    ]
    # Chi-square-style statistic between the EM-estimated and the
    # linkage-equilibrium haplotype frequencies.
    Observed = [pAB, pAb, paB, pab]
    Expected = [freq_A_B, freq_A_b, freq_a_B, freq_a_b]
    R_sq = sum([((o-e)*(o-e))/e for o,e in zip(Observed, Expected)])
    ret["R_sq"] = R_sq
    pA = pAB + pAb
    pB = pAB + paB
    D = pAB - (pA*pB)
    # D' = D / Dmax.  NOTE(review): Dmax is 0 for a monomorphic SNP
    # (pA or pB in {0, 1}) which would raise ZeroDivisionError here —
    # confirm callers only pass polymorphic SNPs.
    if D>=0:
        Dmax = min(pA * (1-pB), (1-pA)*pB)
    else:
        Dmax = min(pA * pB, (1-pA)*(1-pB))
    Dprime = D / Dmax
    ret["Dprime"] = Dprime
    return ret
def find_snp_index_from_data(snp_id, data):
    '''
    Return the index of the variant with the given SNP id in `data`,
    or None when it is not present.

    Bug fix: the original iterated over the global `cases` instead of the
    `data` parameter, so the argument was silently ignored.
    '''
    # THIS IS SLOW (linear scan) — an id -> index dict would be faster.
    for index, variant in enumerate(data):
        if variant['s_id'] == snp_id:
            return index
    return None
def ld(cases, controls, snp_1, snp_2, output):
    '''
    Compute pairwise LD between two SNPs (cases + controls pooled) and
    write one line with r^2, D' and the estimated haplotype frequencies
    to <output>.ld.
    '''
    print ('SNP 1:', snp_1)
    print ('SNP 2:', snp_2)
    output_filename = output + '.ld'
    print ('Saving LD to:', output_filename)
    index_1 = find_snp_index_from_data(snp_1, cases)
    index_2 = find_snp_index_from_data(snp_2, cases)
    genotypes_1 = cases[index_1]['g'] + controls[index_1]['g']
    genotypes_2 = cases[index_2]['g'] + controls[index_2]['g']
    result = Pairwise_linkage_disequilibrium(genotypes_1, genotypes_2)
    haplotype_fields = ["{}:{},{}".format(*h) for h in result['haplotypes']]
    with open(output_filename, 'w') as f:
        fields = [snp_1, snp_2, result['R_sq'], result['Dprime']] + haplotype_fields
        f.write(' '.join(map(str, fields)) + '\n')
def genetic_association_test(SNPs, phenotypes, tests):
    '''
    Run genotypic / allelic / Cochran-Armitage association tests on one SNP.

    SNPs: list of genotype tuples; phenotypes: parallel list with
    1 = control, 2 = case; tests: subset of
    ['GENO', 'TREND', 'ALLELIC', 'DOM', 'REC'].
    Returns {test_name: {'CHISQ': chi_square, 'P': p_value}}.

    For testing make sure that this method returns the same values as plink
    >>> SNPs = [('A','A'), ('A','G'), ('G','G'), ('G','A'), ('G', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'A'), ('G', 'A'), ('A', 'A'), ('A','A'), ('A','G'), ('G','G'), ('G','A'), ('G', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'A'), ('G', 'A'), ('A', 'A')]
    >>> phenotypes = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1]
    >>> tests = ['GENO', 'TREND', 'ALLELIC', 'DOM', 'REC'] # All of them
    >>> results = genetic_association_test(SNPs, phenotypes, tests)
    >>> ' '.join([' '.join(['%0.4f' % results[t]['CHISQ'], '%0.4f' % results[t]['P']]) for t in tests])
    '0.2176 0.6409 0.2198 0.6392 0.5128 0.7738 0.0105 0.9185 0.4945 0.4819'
    '''
    #Assert that phenotypes and genotypes have the same length
    assert len(SNPs) == len(phenotypes)
    #Get the two different alleles of this SNP
    allele_1, allele_2 = SNP_alleles(SNPs)
    # All tables below are 1-indexed; row/column 0 is unused padding.
    #Genotype Contingency table:
    n = np.zeros((2 + 1, 3 + 1))
    #Allelic Contingency table:
    m = np.zeros((2 + 1, 2 + 1))
    #Expected values for Genotype Contingency table:
    En = np.zeros((2 + 1, 3 + 1))
    #Expected values for Allelic Contingency table
    Em = np.zeros((2 + 1, 2 + 1))
    #Total values for Genotype Contingency table
    nS3 = np.zeros(3 + 1)
    n2S = np.zeros(2 + 1)
    #Total values for Allelic Contingency table
    m2S = np.zeros(2 + 1)
    mS2 = np.zeros(2 + 1)
    samples = float(len(SNPs))
    #Build Contingency tables
    for SNP, phenotype in zip(SNPs, phenotypes):
        count_allele_1 = SNP.count(allele_1)
        count_allele_2 = SNP.count(allele_2)
        allele_1_1 = 1 if SNP == (allele_1, allele_1) else 0
        allele_1_2 = 1 if allele_1 in SNP and allele_2 in SNP else 0
        allele_2_2 = 1 if SNP == (allele_2, allele_2) else 0
        mS2[1] += count_allele_1
        mS2[2] += count_allele_2
        m2S[phenotype] += 1
        m[phenotype][1] += count_allele_1
        m[phenotype][2] += count_allele_2
        nS3[1] += allele_1_1
        nS3[2] += allele_1_2
        nS3[3] += allele_2_2
        n2S[phenotype] += 1
        n[phenotype][1] += allele_1_1
        n[phenotype][2] += allele_1_2
        n[phenotype][3] += allele_2_2
    results = {}
    cochran_armitrage_tests = ['TREND', 'DOM', 'REC']
    for test in tests:
        # NOTE(review): the 'ALLELIC' branch uses the genotype table (2 df)
        # and 'GENO' the allele table (1 df); the labels look swapped
        # relative to plink's naming, but they reproduce this function's
        # doctest, so the existing behavior is preserved unchanged.
        if test == 'ALLELIC':
            chi_square = 0.0
            for i in range(1, 3):
                for j in range(1, 4):
                    En[i][j] = (n2S[i] * nS3[j]) / samples
                    if En[i][j] > 0.0:
                        chi_square += ((n[i][j] - En[i][j]) ** 2) / En[i][j]
            # stats.chisqprob was removed from SciPy; chi2.sf is equivalent.
            p_value = stats.chi2.sf(chi_square, 2)
        elif test == 'GENO':
            chi_square = 0.0
            for i in range(1, 3):
                for j in range(1, 3):
                    Em[i][j] = (m2S[i] * mS2[j]) / samples
                    if Em[i][j] > 0.0:
                        chi_square += ((m[i][j] - Em[i][j]) ** 2) / Em[i][j]
            p_value = stats.chi2.sf(chi_square, 1)
        elif test in cochran_armitrage_tests:
            # Genotype weights for TREND / DOM / REC respectively.
            w = [[0, 0, 1, 2], [0, 0, 1, 1], [0, 0, 0, 1]][cochran_armitrage_tests.index(test)]
            #Nominator
            nom = 0.0
            for i in range(1, 4):
                nom += w[i] * (n[1][i] * n2S[2] - n[2][i] * n2S[1])
            nom = nom ** 2
            #First part of denominator
            den1 = 0.0
            for i in range(1, 4):
                den1 += (w[i] ** 2) * nS3[i] * (samples - nS3[i])
            #Second part of denominator
            den2 = 0.0
            for i in range(1, 3):
                for j in range(i + 1, 4):
                    den2 += w[i] * w[j] * nS3[i] * nS3[j]
            den2 = 2.0 * den2
            #Third part of denominator
            den3 = (n2S[1] * n2S[2]) / samples
            #The denominator
            den = den3 * (den1 - den2)
            chi_square = nom / den
            p_value = stats.chi2.sf(chi_square, 1)
        else:
            raise Exception('Unknown test name: "%s"' % (str(test)))
        results[test] = {'CHISQ': chi_square, 'P': p_value}
    return results
def association_test(cases, controls, output, test_names):
    '''
    Run the requested association tests on every SNP and write a table of
    p-values / chi-squares to <output>.association.
    Defaults to the GENO test when no test names are given.
    '''
    output_filename = output + '.association'
    print ('\nSaving significance testing to file:', output_filename)
    tests = list(set(test_names)) if test_names else ['GENO']
    print ('Performing the following tests: ', ' '.join(tests))
    with open(output_filename, 'w') as f:
        header = ['snp_id', 'location']
        for test_name in tests:
            header.extend([test_name + '_pvalue', test_name + '_chisquare'])
        #Write header
        f.write(' '.join(header) + '\n')
        bar = progressbar.ProgressBar(max_value=len(cases)+1)
        for line_counter, (case, control) in enumerate(zip(cases, controls), start=1):
            if line_counter % 1000 == 0:
                bar.update(line_counter)
            genotypes = case['g'] + control['g']
            # Phenotype coding: 2 = case, 1 = control.
            phenotypes = [2] * len(case['g']) + [1] * len(control['g'])
            results = genetic_association_test(genotypes, phenotypes, tests)
            row = [case['s_id'], case['l']]
            for test_name in tests:
                row.extend([results[test_name]['P'], results[test_name]['CHISQ']])
            f.write(' '.join(map(str, row)) + '\n')
def plot_manhattan(location, p_value, output_filename, title='Manhattan plot'):
    '''
    Draw a Manhattan plot (-log p versus genomic location), save it as
    <output_filename>.png and show it.
    NOTE(review): uses the natural log (np.log), not the conventional
    -log10 — confirm this is intended.
    '''
    plt.plot(location, -np.log(p_value), '.', markersize=5, color='black')
    plt.xlabel('Location')
    plt.ylabel('-log(p)')
    plt.title(title)
    #Plot significance threshold as a red line (at -log(p) == 6)
    plt.plot([location[0], location[-1]], [6, 6], color='red')
    plt.xlim(location[0], location[-1])
    png = output_filename + '.png'
    print ('\nSaving to:', png)
    plt.savefig(png)
    plt.show()
def ppoints(n):
    '''
    Same as R's ppoints: n evenly spaced probability points.
    http://stat.ethz.ch/R-manual/R-patched/library/stats/html/ppoints.html
    Also check: http://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Heuristics
    '''
    # R's heuristic: offset 3/8 for small n, 1/2 otherwise.
    offset = 3.0 / 8.0 if n <= 10 else 0.5
    ranks = np.array(range(1, n + 1))
    return (ranks - offset) / (n + 1 - 2.0 * offset)
def plot_qqplot(p_values, output_filename, title='QQ plot'):
    '''
    Create a qq-plot for p values from significance testing.
    Saves the plot as <output_filename>.png and shows it.
    (An unused genomic-inflation value previously computed here was removed.)
    '''
    observed = np.sort(p_values)
    observed = -np.log10(observed)
    expected = -np.log10(ppoints(len(observed)))
    x_label = 'expected -log(10)'
    y_label = 'observed -log(10)'
    # Axis limit: largest finite value of either series, plus a margin.
    xmax = max(expected[np.invert(np.isinf(expected))]) + 1
    ymax = max(observed[np.invert(np.isinf(observed))]) + 1
    xymax = max(xmax, ymax)
    fig, ax = plt.subplots()
    #Set labels
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    #plot ab line
    ax.plot([-0.2, xymax], [-0.2, xymax], color='red', linewidth=2.0)
    #Plot qq plot
    ax.plot(expected, observed, '.')
    png = output_filename + '.png'
    print ('\nSaving plot to:', png)
    plt.savefig(png)
    plt.show()
def read_pvalues(filename, test_name):
    '''
    Read the 'location' and '<test_name>_pvalue' columns from an
    association results file.  Returns (locations, pvalues) as parallel
    lists.  Raises Exception when the requested column is absent.
    '''
    locations = []
    pvalues = []
    with open(filename) as f:
        header = f.readline().replace('\n', '').split()
        p_column_name = test_name + '_pvalue'
        if not p_column_name in header:
            # Message typo fixed ("exit" -> "exist"), consistent with
            # the wording used by p_value_generator().
            raise Exception('Column {} does not exist in file: {}'.format(p_column_name, filename))
        p_column_index = header.index(p_column_name)
        location_index = header.index('location')
        bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
        for line_counter, line in enumerate(f, start=1):
            if line_counter % 10000 == 0:
                bar.update(line_counter)
            fields = line.replace('\n', '').split()
            locations.append(int(fields[location_index]))
            pvalues.append(float(fields[p_column_index]))
    return locations, pvalues
def manhattan(options, output):
    """Read p-values for a test from a results file and render its Manhattan plot."""
    filename, test_name = options[0], options[1]
    locations, pvalues = read_pvalues(filename, test_name)
    plot_manhattan(locations, pvalues, output + '.' + test_name + '.manhattan',
        title='{} Manhattan plot'.format(test_name))
def qqplot(options, output):
    """Read p-values for a test from a results file and render its QQ plot."""
    filename, test_name = options[0], options[1]
    locations, pvalues = read_pvalues(filename, test_name)
    plot_qqplot(pvalues,
        output_filename = output + '.' + test_name + '.qqplot',
        title = '{} QQ plot'.format(test_name))
def parse_vep_response(response):
    '''
    Extract the polyphen / sift predictions per transcript from an
    Ensembl VEP response.
    Returns {transcript_id: {prediction_name: value}}.
    '''
    predictions = {}
    for record in response:
        for consequence in record.get('transcript_consequences', []):
            entry = {}
            for key in ('polyphen_prediction', 'sift_prediction'):
                if key in consequence:
                    entry[key] = consequence[key]
            predictions[consequence['transcript_id']] = entry
    return predictions
def hgvs_formatter(chromosome, location, reference, alternative):
    '''Build an HGVS genomic substitution string, e.g. "1:g.100A>G".'''
    parts = (chromosome, location, reference, alternative)
    return "{}:g.{}{}>{}".format(*parts)
def get_info_hgvs(hgvs_l):
    '''
    POST the given HGVS notations to the Ensembl VEP REST endpoint.
    Returns the parsed predictions, or None when the request fails.
    '''
    url = 'http://rest.ensembl.org/vep/human/hgvs'
    headers = { "Content-Type" : "application/json", "Accept" : "application/json"}
    payload = json.dumps({ "hgvs_notations" : hgvs_l})
    response = requests.post(url, headers=headers, data=payload)
    if not response.ok:
        return None
    return parse_vep_response(response.json())
def get_info_rs(rs_variant_l):
    '''
    POST the given rs identifiers to the Ensembl VEP REST endpoint.
    Returns the parsed predictions, or None when the request fails.
    '''
    url = 'http://rest.ensembl.org/vep/human/id'
    headers = { "Content-Type" : "application/json", "Accept" : "application/json"}
    payload = json.dumps({ "ids" : rs_variant_l })
    response = requests.post(url, headers=headers, data=payload)
    if not response.ok:
        return None
    return parse_vep_response(response.json())
def print_get_info_results(get_info_results):
    '''
    Nicely print get_info results as indented JSON.
    '''
    print (json.dumps(get_info_results, indent=4))
def is_rs(variant):
    '''
    Return a truthy match object when the variant id starts with an
    rs identifier ("rs" followed by digits); None otherwise.
    '''
    rs_pattern = r'rs[\d]+'
    return re.match(rs_pattern, variant)
def get_info(cases, controls, variants_index):
    '''
    Fetch VEP consequence information for the variants at the given indices.

    rs ids are queried directly; other ids of the form "<chrom>-<pos>" are
    lifted over (when liftover is configured) and queried as HGVS notations.
    Returns {variant_index: predictions}; variants whose lookup failed or
    returned nothing are reported with a warning and omitted from the result.
    '''
    ret = {}
    bar = progressbar.ProgressBar(max_value=len(variants_index))
    for variant_counter, variant_index in enumerate(variants_index):
        bar.update(variant_counter)
        #Check which variants are rs and which are hgvs
        rs_id = cases[variant_index]['r_id']
        if is_rs(rs_id):
            get_info_results = get_info_rs([rs_id])
        else:
            s = re.match(r'([\d]+)-([\d]+)', rs_id)
            if not s:
                raise Exception('Could not parse rs_id: {}'.format(rs_id))
            chromosome = s.group(1)
            location = int(s.group(2))
            liftovered_location = do_liftover(chromosome, location)
            reference = cases[variant_index]['r']
            alternative = cases[variant_index]['a']
            this_hgvs = hgvs_formatter(chromosome, liftovered_location, reference, alternative)
            get_info_results = get_info_hgvs([this_hgvs])
        #Check returned data.  The original built these warning strings and
        #silently discarded them; print them so failures are visible.
        if get_info_results == {}:
            print ('WARNING!!!! get_info did not return consequences information for {}'.format(rs_id))
        elif get_info_results is None:
            print ('WARNING!!!! get_info failed for variant: {}'.format(rs_id))
        else:
            ret[variant_index] = get_info_results
    return ret
def do_liftover(chromosome, position):
    '''
    Convert a position to the target assembly with the global LiftOver
    instance.  Returns the position unchanged when liftover is not set up,
    or None when the coordinate cannot be converted.
    '''
    if not lo:
        return position
    converted = lo.convert_coordinate('chr{}'.format(chromosome), position)
    if not converted:
        print ('WARNING!! Could not liftover: {}:{}'.format(chromosome, position))
        return None
    if len(converted) > 1:
        # Is this possible???
        print ('WARNING! Liftover returned more than one positions for {}:{}'.format(chromosome, position))
    return converted[0][1]
def setup_liftover(from_assembly, to_assembly):
    '''
    Initialise the module-level LiftOver instance used by do_liftover(),
    converting coordinates from `from_assembly` to `to_assembly`.
    '''
    global lo
    print ('Setting up liftover..')
    lo = LiftOver(from_assembly, to_assembly)
    print ('...Done')
def plot_distribution(data, bins=50, xlabel='', ylabel='', title='', output_filename=None):
    '''
    Plot a histogram of `data` and show it; when output_filename is given,
    also save the figure as <output_filename>.png.
    http://matplotlib.org/1.2.1/examples/pylab_examples/histogram_demo.html
    '''
    # `bins` is rebound here to the bin-edge array returned by hist().
    n, bins, patches = plt.hist(data, bins)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    if output_filename:
        png = output_filename + '.png'
        print ('Saving to ', png)
        plt.savefig(png)
    plt.show()
def maf_from_frequency_file(filename):
    '''
    Read a .frequency file (as written by allele_frequency) and return,
    per SNP, the minimum of the last two frequency columns.
    '''
    print ('Reading file:', filename)
    frequencies = []
    with open(filename) as f:
        for line in f:
            fields = line.replace('\n', '').split()
            frequencies.append(min(float(fields[-2]), float(fields[-1])))
    return frequencies
def p_from_hwe_file(filename):
    '''
    Read a .hwe_* file (as written by hwe) and return the p-value
    (last column) of every SNP.
    '''
    print ('Reading file:', filename)
    with open(filename) as f:
        return [float(line.replace('\n', '').split()[-1]) for line in f]
def allele_frequency_plot(filename):
    '''
    Plots the distribution of minor allele frequencies from a
    .frequency file.
    '''
    plot_distribution(
        maf_from_frequency_file(filename),
        bins=100,
        xlabel='Minor Allele Frequencies',
        ylabel='Counts',
        output_filename=filename,
    )
def hwe_plot(filename):
    '''
    Plots the distribution of HWE p-values from a .hwe_* file.
    '''
    plot_distribution(
        p_from_hwe_file(filename),
        bins=100,
        xlabel='HWE p-values',
        ylabel='Counts',
        output_filename=filename,
    )
def maf_filter(cases, controls, output, maf_filter):
    '''
    Remove every SNP whose minor allele frequency is below maf_filter.

    The ids of the filtered-out SNPs are written to a temporary file
    ("remove_snps.tmp") which is then fed to remove_snps to create the
    filtered dataset under *output*.
    '''
    print ('\nFiltering with MAF>{}'.format(maf_filter))
    print ('Calculate frequencies..')
    frequencies_filename = allele_frequency(cases, controls, output)
    print ('Read file with frequencies..')
    all_mafs = maf_from_frequency_file(frequencies_filename)
    print ('Apply filter..')
    filtered_out = 0
    seen = 0
    with open('remove_snps.tmp', 'w') as f:
        # Frequencies are in the same order as the SNPs in `cases`.
        for snp, maf in zip(cases, all_mafs):
            seen += 1
            if maf < maf_filter:
                filtered_out += 1
                f.write(snp['s_id'] + '\n')
    print ('Removed: {} out from {} SNPs'.format(filtered_out, seen))
    print ('Create new data..')
    remove_snps(cases, controls, output, 'remove_snps.tmp')
def hwe_filter(cases, controls, output, hwe_filter, test):
    '''
    Remove every SNP whose Hardy-Weinberg p-value is below hwe_filter.

    *test* selects the HWE method passed to hwe() ('asymptotic' or 'exact',
    as used by the command line options).  The ids of the filtered-out SNPs
    are written to "remove_snps.tmp" and removed via remove_snps.
    '''
    print ('\nFiltering with HWE>{}'.format(hwe_filter))
    print ('Calculate hwe..')
    hwe_filename = hwe(cases, controls, output, test)
    print ('\nReading file with p-values: {}'.format(hwe_filename))
    p_values = p_from_hwe_file(hwe_filename)
    print ('Apply filter..')
    filtered_out = 0
    seen = 0
    with open('remove_snps.tmp', 'w') as f:
        # p-values are in the same order as the SNPs in `cases`.
        for snp, p_value in zip(cases, p_values):
            seen += 1
            if p_value < hwe_filter:
                filtered_out += 1
                f.write(snp['s_id'] + '\n')
    print ('Removed: {} out from {} SNPs'.format(filtered_out, seen))
    print ('Create new data..')
    remove_snps(cases, controls, output, 'remove_snps.tmp')
def p_value_generator(filename, test_name):
    '''
    Yield the p-values of the "<test_name>_pvalue" column of an association
    results file, one per data line (the first line is the header).

    This is a generator: https://wiki.python.org/moin/Generators

    Raises:
        Exception: if the expected p-value column is missing from the header.
    '''
    print ('\nOpening file:', filename)
    with open(filename) as f:
        header = f.readline().replace('\n', '').split()
        p_column_name = test_name + '_pvalue'
        if p_column_name not in header:
            raise Exception('{} column does not exist in {}'.format(p_column_name, filename))
        p_column_index = header.index(p_column_name)
        # NOTE(review): a dead per-10000-lines progress counter (that only
        # executed `pass`) was removed here.
        for line in f:
            line_splitted = line.replace('\n', '').split()
            yield float(line_splitted[p_column_index])
def find_peaks(data, window_size, significance_level):
    '''
    Scan *data* (a sequence of p-values) in consecutive windows of
    window_size elements.  For every window whose maximum of -log(p)
    exceeds significance_level, record that maximum and its absolute
    position in *data*.

    Returns a list of {'index': ..., 'max': ...} dicts.
    '''
    peaks = []
    for start in range(0, len(data), window_size):
        neg_log = -np.log(data[start : start + window_size])
        height = np.max(neg_log)
        if height < significance_level:
            # Nothing significant in this window.
            continue
        # The int() cast works around https://bugs.python.org/issue24313
        # Yep.. even python has its quirks..
        peaks.append({'index': start + int(np.argmax(neg_log)), 'max': height})
    return peaks
def get_data_point(cases, controls, index):
    '''
    Return the concatenated case + control genotypes ('g') of the SNP
    at position *index*.
    '''
    case_genotypes = cases[index]['g']
    control_genotypes = controls[index]['g']
    return case_genotypes + control_genotypes
def disequilibrium_criterion(ld_result, dprime_threshod=0.98, r_sq_threshold=0.2):
    '''
    Decide whether an LD result indicates linkage disequilibrium:
    True when |D'| exceeds dprime_threshod or r^2 exceeds r_sq_threshold.

    Thresholds follow:
    http://www.nature.com/nrg/journal/v4/n8/full/nrg1123.html
    '''
    if np.abs(ld_result['Dprime']) > dprime_threshod:
        return True
    return bool(ld_result['R_sq'] > r_sq_threshold)
def get_independent_peaks(cases, controls, peaks):
    '''
    Collapse peaks that are in linkage disequilibrium with each other.

    Peaks are compared in consecutive pairs; whenever a pair is in LD,
    only the first peak of the pair survives.  The whole pass is repeated
    until no change happens.

    Bug fix: when the peak list has an odd length, the unpaired last peak
    used to be silently dropped (`continue` without appending it); it is
    now kept.
    '''
    iterations = 0
    while True:
        iterations += 1
        new_peaks = []
        change_happened = False
        for peak_index in range(0, len(peaks), 2):
            peak_1 = peaks[peak_index]
            if peak_index + 1 == len(peaks):
                # Odd number of peaks: keep the unpaired last peak.
                new_peaks.append(peak_1)
                continue
            peak_2 = peaks[peak_index + 1]
            genotypes_1 = get_data_point(cases, controls, peak_1['index'])
            genotypes_2 = get_data_point(cases, controls, peak_2['index'])
            ld_result = Pairwise_linkage_disequilibrium(genotypes_1, genotypes_2)
            if disequilibrium_criterion(ld_result):
                # In LD: keep only the first peak of the pair.
                new_peaks.append(peak_1)
                change_happened = True
            else:
                new_peaks.append(peak_1)
                new_peaks.append(peak_2)
        print ('Iteration: {}. Peaks: {}'.format(iterations, len(new_peaks)))
        peaks = new_peaks
        if not change_happened:
            # Our work here is done
            break
    return peaks
def regional_ld_snps(cases, controls, index):
    '''
    Return the indexes of all SNPs around *index* that are in linkage
    disequilibrium with it.

    Walks left and right from the peak; each direction stops once a SNP is
    found that is more than 100 positions away from the last SNP still
    in LD.

    Bug fixes:
      * the second parameter was named `control` while the body referenced
        `controls`, so the function only worked by accidentally picking up
        the module-global `controls` from __main__;
      * walking past either end of the data no longer wraps around to
        negative indexes or raises IndexError.
    '''
    ret = []
    peak_genotypes = get_data_point(cases, controls, index)
    for direction in [-1, +1]:
        compare_with = index + direction
        last_snp_in_ld = index
        while 0 <= compare_with < len(cases):
            compare_with_genotype = get_data_point(cases, controls, compare_with)
            ld_result = Pairwise_linkage_disequilibrium(peak_genotypes, compare_with_genotype)
            if disequilibrium_criterion(ld_result):
                ret.append(compare_with)
                last_snp_in_ld = compare_with
            else:
                # How far is this SNP from the last one still in LD?
                distance = np.abs(compare_with - last_snp_in_ld)
                if distance > 100:  # 1000 returned the same results.
                    break
            compare_with += direction  # move along
    return ret
def independent(cases, controls, output, independent_arg):
    '''
    This function takes as input the result of a statistical test analysis
    and locates the independent signals.

    independent_arg is (filename, test_name): the association results file
    and the name of the test whose p-values will be scanned.  The results
    (peak SNP index -> indexes of SNPs in LD with it) are saved as JSON
    to "<output>.independent".
    '''
    filename = independent_arg[0]
    test_name = independent_arg[1]
    p_values = list(p_value_generator(filename, test_name))
    peaks = find_peaks(p_values, 100, 8)
    print ('Found {} potential interesting significance peaks!'.format(len(peaks)))
    ind_peaks = get_independent_peaks(cases, controls, peaks)
    print ('Independent peaks: {}'.format(len(ind_peaks)))
    peak_lds = {}
    for peak_counter, peak in enumerate(ind_peaks):
        # Typo fix: message used to read 'Ivestigating'.
        print ('Investigating peak: {}'.format(peak_counter+1))
        peak_index = peak['index']
        peak_lds[peak_index] = regional_ld_snps(cases, controls, peak_index)
        print ('Peak {}. Found {} SNPs in LD'.format(peak_counter, len(peak_lds[peak_index])))
    output_filename = output + '.independent'
    print ('Saving results to {}'.format(output_filename))
    with open(output_filename, 'w') as f:
        json.dump(peak_lds, f, indent=4)
def get_info_independent(cases, controls, output, input_filename):
    '''
    Load the peaks saved by independent(), fetch info for every peak and
    for the SNPs in LD with it, and save everything as JSON to
    "<output>.get_info".
    '''
    print ('Reading:', input_filename)
    with open(input_filename) as f:
        peak_lds = json.load(f)
    peak_lds = {int(k): v for k, v in peak_lds.items()}  # JSON stores keys as strings..
    to_save = {}
    for peak_counter, peak in enumerate(peak_lds):
        print ('\nGetting info for peak: {}/{}'.format(peak_counter+1, len(peak_lds)))
        to_save[cases[peak]['r_id']] = {
            'location': cases[peak]['l'],
            'info': get_info(cases, controls, peak_lds[peak]),
        }
    output_filename = output + '.get_info'
    print ('\nSaving results to:', output_filename)
    with open(output_filename, 'w') as f:
        json.dump(to_save, f, indent=4)
if __name__ == '__main__':
    '''
    Command-line entry point.  Parses the options, loads the cases/controls
    data when both input files are given, and runs each requested action in
    the order listed below (so e.g. filters apply before plots).

    Example runs:
    /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output
    /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --keep_snps example.keep.snps
    /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --remove_snps example.keep.snp
    /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --keep_samples example.keep.samples
    /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --remove_samples example.keep.samples
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --allele_frequency
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --allele_frequency_plot output.frequency
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_maf005 --maf_filter 0.05
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --hwe_asymptotic
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --hwe_exact
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --hwe_plot output_asymptotic.hwe_asymptotic
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_hwe_asymptotic_0001 --hwe_asymptotic_filter 0.05
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_hwe_exact_0001 --hwe_exact_filter 0.05
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --ld snp_5 snp_7
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --association_test
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --manhattan output.association GENO
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --qqplot output.association GENO
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --liftover hg18 hg38 --get_info rs4814683
    time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --liftover hg18 hg38 --get_info snp_1
    '''
    # Declare all command line options.
    argparser = argparse.ArgumentParser(description='Python project https://gist.github.com/kantale/24f4f02d5b87ce0146e0455791e71aeb ')
    argparser.add_argument('--controls_file', default=None, required = False)
    argparser.add_argument('--cases_file', default=None, required=False)
    argparser.add_argument('--output')
    argparser.add_argument('--keep_snps', default=None, required = False)
    argparser.add_argument('--remove_snps', default=None, required = False)
    argparser.add_argument('--keep_samples', default=None, required=False)
    argparser.add_argument('--remove_samples', default=None, required=False)
    argparser.add_argument('--allele_frequency', action='store_true', required=False)
    argparser.add_argument('--allele_frequency_plot', required=False)
    argparser.add_argument('--maf_filter', default=None, type=float, required=False)
    argparser.add_argument('--hwe_asymptotic', action='store_true', required=False)
    argparser.add_argument('--hwe_exact', action='store_true', required=False)
    argparser.add_argument('--hwe_plot', default=None, required=False)
    argparser.add_argument('--hwe_asymptotic_filter', default=None, type=float, required=False)
    argparser.add_argument('--hwe_exact_filter', default=None, type=float, required=False)
    # The next options take multiple values, e.g. --ld snp_5 snp_7 .
    argparser.add_argument('--ld', nargs=2, required=False)
    argparser.add_argument('--association_test', nargs='*', required=False)
    argparser.add_argument('--manhattan', nargs=2, required=False)
    argparser.add_argument('--qqplot', nargs=2, required=False)
    argparser.add_argument('--get_info', default=None, required=False)
    argparser.add_argument('--liftover', nargs=2, required=False)
    argparser.add_argument('--independent', nargs=2, default=None, required=False)
    argparser.add_argument('--get_info_independent', default=None, required=False)
    options = argparser.parse_args()
    cases = None
    controls = None
    output = options.output
    # Only load the genotype data when BOTH input files were given; the
    # plot-only actions below work without them.
    if not options.controls_file is None and not options.cases_file is None:
        cases, controls = read_input(options)
    #Setup liftover object
    if options.liftover:
        print ('\nSetting up liftover object from:{} to:{}'.format(*options.liftover))
        setup_liftover(options.liftover[0], options.liftover[1])
    # SNP / sample subsetting actions.
    if not options.keep_snps is None:
        keep_snps(cases, controls, output, options.keep_snps)
    if not options.remove_snps is None:
        remove_snps(cases, controls, output, options.remove_snps)
    if not options.keep_samples is None:
        keep_samples(cases, controls, output, options.keep_samples)
    if not options.remove_samples is None:
        remove_samples(cases, controls, output, options.remove_samples)
    # Allele frequency computation / plotting and MAF filtering.
    if options.allele_frequency:
        allele_frequency(cases, controls, output)
    if options.allele_frequency_plot:
        allele_frequency_plot(options.allele_frequency_plot)
    if not options.maf_filter is None:
        maf_filter(cases, controls, output, options.maf_filter)
    # Hardy-Weinberg equilibrium tests, plotting and filtering.
    if options.hwe_asymptotic:
        hwe(cases, controls, output, 'asymptotic')
    if options.hwe_exact:
        hwe(cases, controls, output, 'exact')
    if options.hwe_plot:
        hwe_plot(options.hwe_plot)
    if not options.hwe_asymptotic_filter is None:
        hwe_filter(cases, controls, output, options.hwe_asymptotic_filter, 'asymptotic')
    if not options.hwe_exact_filter is None:
        hwe_filter(cases, controls, output, options.hwe_exact_filter, 'exact')
    # Linkage disequilibrium between two named SNPs.
    if options.ld:
        ld(cases, controls, options.ld[0], options.ld[1], output)
    # Association testing and its plots.
    if not options.association_test is None:
        association_test(cases, controls, output, options.association_test)
    if not options.manhattan is None:
        manhattan(options.manhattan, output)
    if not options.qqplot is None:
        qqplot(options.qqplot, output)
    # Variant annotation and independent-signal detection.
    if not options.get_info is None:
        get_info(cases, controls, [options.get_info])
    if not options.independent is None:
        independent(cases, controls, output, options.independent)
    if not options.get_info_independent is None:
        get_info_independent(cases, controls, output, options.get_info_independent)
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment