Implementation of Python project: https://gist.github.com/kantale/24f4f02d5b87ce0146e0455791e71aeb
| ''' | |
| Copyright (c) 2017, Alexandros Kanterakis kantale@ics.forth.gr | |
| All rights reserved. | |
| Simplified BSD License: | |
| Redistribution and use in source and binary forms, with or without | |
| modification, are permitted provided that the following conditions are met: | |
| 1. Redistributions of source code must retain the above copyright notice, this | |
| list of conditions and the following disclaimer. | |
| 2. Redistributions in binary form must reproduce the above copyright notice, | |
| this list of conditions and the following disclaimer in the documentation | |
| and/or other materials provided with the distribution. | |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR | |
| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| ''' | |
| import re | |
| import os | |
| import json | |
| import time | |
| import numpy as np | |
| import argparse | |
| import requests | |
| import progressbar | |
| from pyliftover import LiftOver | |
| import matplotlib.pyplot as plt | |
| from scipy import stats | |
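| # Note on dependencies: the imports above most likely correspond to the PyPI packages | |
| # numpy, scipy, matplotlib, requests, pyliftover and progressbar2 (the progressbar module | |
| # is assumed here to be the progressbar2 package, whose ProgressBar(max_value=...) and | |
| # UnknownLength API is what the code below uses). | |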
| # global liftover variable | |
| lo = None | |
| def print_now(): | |
| ''' | |
| Returns the current time as a formatted string. | |
| Useful when printing progress information. | |
| ''' | |
| return time.strftime("%a, %d %b %Y %H:%M:%S", time.gmtime()) | |
| def load(filename): | |
| print ('Opening file:', filename) | |
| data_to_return = [] | |
| with open(filename) as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength) | |
| for line in f: | |
| line_counter += 1 | |
| # if line_counter > 10000: | |
| # break | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| line_splitted = line.replace('\n', '').split() | |
| snp_id = line_splitted[0] | |
| rs_id = line_splitted[1] | |
| location = int(line_splitted[2]) | |
| reference = line_splitted[3] | |
| alternative = line_splitted[4] | |
| this_variant_hash = { | |
| ('1','0','0'): (reference, reference), | |
| ('0','1','0'): (reference, alternative), | |
| ('0','0','1'): (alternative, alternative) | |
| } | |
| numeric_genotypes = line_splitted[5:] | |
| # Split the genotype columns into triplets and map each triplet to an allele pair, e.g. ('1','0','0') -> (reference, reference), ('0','1','0') -> (reference, alternative), etc. | |
| allelic_genotypes = [this_variant_hash[(numeric_genotypes[i], numeric_genotypes[i+1], numeric_genotypes[i+2])] for i in range(0, len(numeric_genotypes), 3)] | |
| this_variant = { | |
| 's_id': snp_id, | |
| 'r_id': rs_id, | |
| 'l': location, | |
| 'r': reference, | |
| 'a': alternative, | |
| 'g': allelic_genotypes, | |
| } | |
| data_to_return.append(this_variant) | |
| return data_to_return | |
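| # A minimal sketch of what load() expects and returns, using made-up values: | |
| # | |
| #   input line : "snp_0 rs123 11000 A G 1 0 0 0 1 0 0 0 1" | |
| #   parsed into: {'s_id': 'snp_0', 'r_id': 'rs123', 'l': 11000, 'r': 'A', 'a': 'G', | |
| #                 'g': [('A', 'A'), ('A', 'G'), ('G', 'G')]} | |
| # | |
| # i.e. every sample contributes one (1,0,0) / (0,1,0) / (0,0,1) triplet that is decoded | |
| # into a pair of alleles via this_variant_hash. | |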
| def save(data, filename): | |
| ''' | |
| Save data | |
| ''' | |
| print ('\nSaving data to:', filename) | |
| with open(filename, 'w') as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=len(data)+1) | |
| for d in data: | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| line = [ | |
| d['s_id'], d['r_id'], str(d['l']), d['r'], d['a'] | |
| ] | |
| this_variant_hash = { | |
| (d['r'], d['r']) : ['1', '0', '0'], | |
| (d['r'], d['a']) : ['0', '1', '0'], | |
| (d['a'], d['a']) : ['0', '0', '1'], | |
| } | |
| for alleleic_genotype in d['g']: | |
| line.extend(this_variant_hash[alleleic_genotype]) | |
| f.write(' '.join(line) + '\n') | |
| def read_input(options): | |
| print ('\nLOADING CASES:') | |
| cases = load(options.cases_file) | |
| print ('\nLOADING CONTROLS:') | |
| controls = load(options.controls_file) | |
| return cases, controls | |
| def keep_snps_data(data, keep_snps, output_filename): | |
| data_new = [x for x in data if x['s_id'] in keep_snps] | |
| save(data_new, output_filename) | |
| def remove_snps_data(data, remove_snps, output_filename): | |
| data_new = [x for x in data if not x['s_id'] in remove_snps] | |
| save(data_new, output_filename) | |
| def read_snp_list(filename): | |
| with open(filename) as f: | |
| snps = {x.replace('\n', ''): None for x in f.readlines()} | |
| print ('Read {} SNPs from {}'.format(len(snps), filename)) | |
| return snps | |
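| # Hypothetical example of the SNP-list file format read above (one SNP id per line): | |
| # | |
| #   snp_5 | |
| #   snp_7 | |
| # | |
| # read_snp_list() turns this into {'snp_5': None, 'snp_7': None}, which is only used for | |
| # fast membership tests in keep_snps_data() / remove_snps_data(). | |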
| def keep_snps(cases, controls, output, keep_snps_filename): | |
| keep_snps = read_snp_list(keep_snps_filename) | |
| keep_snps_data(cases, keep_snps, output + '.keep_snps.cases') | |
| keep_snps_data(controls, keep_snps, output + '.keep_snps.controls') | |
| def remove_snps(cases, controls, output, remove_snps_filename): | |
| remove_snps = read_snp_list(remove_snps_filename) | |
| remove_snps_data(cases, remove_snps, output + '.remove_snps.cases') | |
| remove_snps_data(controls, remove_snps, output + '.remove_snps.controls') | |
| def read_sample_list(filename): | |
| sample_ids = { | |
| 'cases': [], | |
| 'controls': [], | |
| } | |
| with open(filename) as f: | |
| for line in f: | |
| sample = line.replace('\n', '') | |
| sample_splitted = sample.split('_') | |
| if len(sample_splitted) != 2: | |
| raise Exception('Unknown sample format: {}'.format(sample)) | |
| cases_controls = sample_splitted[0] | |
| if not cases_controls in ['case', 'control']: | |
| raise Exception('First part of sample id should be case or control') | |
| cases_controls += 's' # control --> controls, case --> cases | |
| try: | |
| case_id = int(sample_splitted[1]) | |
| except ValueError as e: # int() raises ValueError on invalid input | |
| print ('Second part of sample id should be an integer') | |
| raise e | |
| sample_ids[cases_controls].append(case_id-1) | |
| # Remove possible duplicate | |
| for case_control in sample_ids: | |
| sample_ids[case_control] = list(set(sample_ids[case_control])) | |
| print ('Read {} case samples and {} control samples from {}'.format(len(sample_ids['cases']), len(sample_ids['controls']), filename)) | |
| return sample_ids | |
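| # Hypothetical example of the sample-list file format read above, one "case_<N>" or | |
| # "control_<N>" id per line, with N being 1-based: | |
| # | |
| #   case_1 | |
| #   case_3 | |
| #   control_2 | |
| # | |
| # would yield case indexes 0 and 2 and control index 1: the 1-based ids are converted | |
| # to 0-based positions into the genotype lists and duplicates are removed. | |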
| def keep_sample_data(data, samples_to_keep, output_filename): | |
| for d in data: | |
| d['g'] = [g for i, g in enumerate(d['g']) if i in samples_to_keep] | |
| save(data, output_filename) | |
| def keep_samples(cases, controls, output, keep_samples_filename): | |
| samples = read_sample_list(keep_samples_filename) | |
| keep_sample_data(cases, samples['cases'], output + '.keep_samples.cases') | |
| keep_sample_data(controls, samples['controls'], output + '.keep_samples.controls') | |
| def remove_sample_data(data, samples_to_remove, output_filename): | |
| for d in data: | |
| d['g'] = [g for i, g in enumerate(d['g']) if not i in samples_to_remove] | |
| save(data, output_filename) | |
| def remove_samples(cases, controls, output, remove_samples_filename): | |
| samples = read_sample_list(remove_samples_filename) | |
| remove_sample_data(cases, samples['cases'], output + '.remove_samples.cases') | |
| remove_sample_data(controls, samples['controls'], output + '.remove_samples.controls') | |
| def allele_frequency(cases, controls, output): | |
| output_filename = output + '.frequency' | |
| print ('\nSaving frequencies to:', output_filename) | |
| with open(output_filename, 'w') as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=len(cases)+1) | |
| for case, control in zip(cases, controls): | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| to_print = [case['s_id']] | |
| reference = case['r'] | |
| alternative = case['a'] | |
| all_cases = len(case['g']) | |
| all_controls = len(control['g']) | |
| # Count of the reference allele in the controls | |
| ref_controls = sum([genotype.count(reference) for genotype in control['g']]) | |
| #/ (2.0 * len(control['g'])) | |
| # Count of the alternative allele in the controls | |
| alt_controls = sum([genotype.count(alternative) for genotype in control['g']]) | |
| #/ (2.0 * len(control['g'])) | |
| # Count of the reference allele in the cases | |
| ref_cases = sum([genotype.count(reference) for genotype in case['g']]) | |
| #/ (2.0 * len(case['g'])) | |
| # Count of the alternative allele in the cases | |
| alt_cases = sum([genotype.count(alternative) for genotype in case['g']]) | |
| #/ (2.0 * len(case['g'])) | |
| # Count of the reference allele in controls + cases | |
| ref_all = ref_controls + ref_cases | |
| # Count of the alternative allele in controls + cases | |
| alt_all = alt_controls + alt_cases | |
| to_print.extend([ | |
| ref_controls / (2.0 * all_controls), | |
| alt_controls / (2.0 * all_controls), | |
| ref_cases / (2.0 * all_cases), | |
| alt_cases / (2.0 * all_cases), | |
| ref_all / (2.0 * (all_controls+all_cases)), | |
| # alt_all was computed above but never written; maf_from_frequency_file() expects the | |
| # reference and alternative frequencies of controls + cases as the last two columns. | |
| alt_all / (2.0 * (all_controls+all_cases)), | |
| ]) | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| return output_filename | |
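| # The .frequency file written above has one row per SNP with, in order: snp_id, the | |
| # reference and alternative allele frequencies in the controls, in the cases, and in | |
| # controls + cases combined. Each frequency is an allele count divided by | |
| # 2 * number_of_samples, since every diploid sample carries two alleles; the last two | |
| # columns are what maf_from_frequency_file() reads back further down. | |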
| def Hardy_Weinberg_Equilibrium_asymptotic_test(obs_hets, obs_hom1, obs_hom2): | |
| ''' | |
| Compute the Hardy-Weinberg Equilibrium of a SNP by using an asymptotic test. | |
| This is equivalent to the value produced by the --hardy2 option of the plink | |
| software (http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#hardy). | |
| ''' | |
| xx = obs_hom1 | |
| xy = obs_hets | |
| yy = obs_hom2 | |
| if xy == 0 and yy == 0: | |
| return 0.0 | |
| if xx ==0 and xy == 0: | |
| return 0.0 | |
| samples = xx + xy + yy | |
| observed_heterozygosity = float(xy) / samples | |
| alleleX_count = (2*xx) + xy | |
| alleleX_ratio = float(alleleX_count) / (2 * samples) | |
| alleleY_count = xy + (2*yy) | |
| alleleY_ratio = float(alleleY_count) / (2 * samples) | |
| expected_xx_n = alleleX_ratio * alleleX_ratio * samples | |
| expected_xy_n = 2.0 * alleleX_ratio * alleleY_ratio * samples | |
| expected_yy_n = alleleY_ratio * alleleY_ratio * samples | |
| expected_total = expected_xx_n + expected_xy_n + expected_yy_n | |
| expected_xx_ratio = float(expected_xx_n) / expected_total | |
| expected_xy_ratio = float(expected_xy_n) / expected_total | |
| expected_yy_ratio = float(expected_yy_n) / expected_total | |
| chi_square_xx = ((xx - expected_xx_n) ** 2.0) / expected_xx_n | |
| chi_square_xy = ((xy - expected_xy_n) ** 2.0) / expected_xy_n | |
| chi_square_yy = ((yy - expected_yy_n) ** 2.0) / expected_yy_n | |
| chi_square_total = chi_square_xx + chi_square_xy + chi_square_yy | |
| # scipy.stats.chisqprob was removed in SciPy 1.0; chi2.sf computes the same upper-tail probability | |
| return stats.chi2.sf(chi_square_total, 1) | |
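| # What the asymptotic test above computes: from the observed genotype counts (xx, xy, yy) | |
| # it derives the two allele frequencies p and q, builds the expected counts | |
| # (p^2 * N, 2pq * N, q^2 * N) and returns the upper tail of the resulting chi-square | |
| # statistic with one degree of freedom. Hedged usage sketch with made-up counts: | |
| # | |
| #   p_asym = Hardy_Weinberg_Equilibrium_asymptotic_test(obs_hets=57, obs_hom1=14, obs_hom2=50) | |
| # | |
| # Small p-values indicate departure from Hardy-Weinberg equilibrium. | |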
| def Hardy_Weinberg_Equilibrium_exact_test(obs_hets, obs_hom1, obs_hom2): | |
| ''' | |
| According to the comments: | |
| This code implements an exact SNP test of Hardy-Weinberg Equilibrium as described in Wigginton, JE, Cutler, DJ, and Abecasis, GR (2005) A Note on Exact Tests of Hardy-Weinberg Equilibrium. | |
| American Journal of Human Genetics. 76: 000 - 000 | |
| Written by Jan Wigginton | |
| Converted to Python from the C/C++ code at http://www.sph.umich.edu/csg/abecasis/Exact/c_instruct.html by Alexandros Kanterakis (kantale@ics.forth.gr) | |
| This test is used by plink when applying the --hardy option (http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#hardy) and from Beagle utilities (http://faculty.washington.edu/browning/beagle_utilities/utilities.html) to compute Hardy-Weinberg Equilibrium. | |
| ''' | |
| if obs_hom1 < 0 or obs_hom2 < 0 or obs_hets < 0: | |
| raise Exception("FATAL ERROR - SNP-HWE: Current genotype configuration (%s %s %s) includes negative count" % (obs_hets, obs_hom1, obs_hom2)) | |
| obs_homc = obs_hom2 if obs_hom1 < obs_hom2 else obs_hom1 | |
| obs_homr = obs_hom1 if obs_hom1 < obs_hom2 else obs_hom2 | |
| rare_copies = 2 * obs_homr + obs_hets | |
| genotypes = obs_hets + obs_homc + obs_homr | |
| het_probs = [0.0] * (rare_copies + 1) | |
| #start at midpoint | |
| mid = rare_copies * (2 * genotypes - rare_copies) // (2 * genotypes) | |
| #check to ensure that midpoint and rare alleles have same parity | |
| if (rare_copies & 1) ^ (mid & 1): | |
| mid += 1 | |
| curr_hets = mid | |
| curr_homr = (rare_copies - mid) // 2 | |
| curr_homc = genotypes - curr_hets - curr_homr | |
| het_probs[mid] = 1.0 | |
| sum = float(het_probs[mid]) | |
| for curr_hets in range(mid, 1, -2): | |
| het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)) | |
| sum += het_probs[curr_hets - 2] | |
| # 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote | |
| curr_homr += 1 | |
| curr_homc += 1 | |
| curr_hets = mid | |
| curr_homr = (rare_copies - mid) // 2 | |
| curr_homc = genotypes - curr_hets - curr_homr | |
| for curr_hets in range(mid, rare_copies - 1, 2): | |
| het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc / ((curr_hets + 2.0) * (curr_hets + 1.0)) | |
| sum += het_probs[curr_hets + 2] | |
| #add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote | |
| curr_homr -= 1 | |
| curr_homc -= 1 | |
| for i in range(0, rare_copies + 1): | |
| het_probs[i] /= sum | |
| #alternate p-value calculation for p_hi/p_lo | |
| p_hi = float(het_probs[obs_hets]) | |
| for i in range(obs_hets, rare_copies+1): | |
| p_hi += het_probs[i] | |
| p_lo = float(het_probs[obs_hets]) | |
| for i in range(obs_hets-1, -1, -1): | |
| p_lo += het_probs[i] | |
| p_hi_lo = 2.0 * p_hi if p_hi < p_lo else 2.0 * p_lo | |
| p_hwe = 0.0 | |
| # p-value calculation for p_hwe | |
| for i in range(0, rare_copies + 1): | |
| if het_probs[i] > het_probs[obs_hets]: | |
| continue | |
| p_hwe += het_probs[i] | |
| p_hwe = 1.0 if p_hwe > 1.0 else p_hwe | |
| return p_hwe | |
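| # Hedged usage sketch with made-up counts, comparing the two HWE tests defined above | |
| # (the exact test is generally preferred when rare alleles give small expected counts): | |
| # | |
| #   hets, hom1, hom2 = 57, 14, 50 | |
| #   p_exact = Hardy_Weinberg_Equilibrium_exact_test(hets, hom1, hom2) | |
| #   p_asym = Hardy_Weinberg_Equilibrium_asymptotic_test(hets, hom1, hom2) | |
| # | |
| # Both return a p-value; hwe() below selects one of them through its test_name argument. | |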
| def hwe(cases, controls, output, test_name): | |
| output_filename = output + '.hwe_' + test_name | |
| print ('\nSaving HWE to:', output_filename) | |
| test_function = { | |
| 'asymptotic': Hardy_Weinberg_Equilibrium_asymptotic_test, | |
| 'exact': Hardy_Weinberg_Equilibrium_exact_test, | |
| }[test_name] | |
| with open(output_filename, 'w') as f: | |
| line_counter = 0 | |
| bar = progressbar.ProgressBar(max_value=len(cases)+1) | |
| for case, control in zip(cases, controls): | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| to_print = [case['s_id']] | |
| homozygous_1 = (case['r'], case['r']) | |
| homozygous_2 = (case['a'], case['a']) | |
| heterozygous = (case['r'], case['a']) | |
| all_genotypes = case['g'] + control['g'] | |
| obs_hets = all_genotypes.count(heterozygous) | |
| obs_hom1 = all_genotypes.count(homozygous_1) | |
| obs_hom2 = all_genotypes.count(homozygous_2) | |
| assert obs_hets + obs_hom1 + obs_hom2 == len(all_genotypes) | |
| p_value = test_function(obs_hets, obs_hom1, obs_hom2) | |
| to_print.append(p_value) | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| return output_filename | |
| def SNP_alleles( | |
| SNP = None, | |
| missing = "0", | |
| ): | |
| ''' | |
| Get the two different alleles of a SNP. The parameter SNP is a list of tuples with genotypes. | |
| For example if SNP = [(A,A), (A,G), (G,G), (G,A)] the function returns (A,G) | |
| ''' | |
| allele_1 = None | |
| for gen in SNP: | |
| if gen[0] != missing: | |
| if not allele_1: | |
| allele_1 = gen[0] | |
| elif gen[0] != allele_1: | |
| return (allele_1, gen[0]) | |
| if gen[1] != missing: | |
| if not allele_1: | |
| allele_1 = gen[1] | |
| elif gen[1] != allele_1: | |
| return (allele_1, gen[1]) | |
| return (allele_1, allele_1) | |
| def Pairwise_linkage_disequilibrium( | |
| SNP_1 = None, | |
| SNP_2 = None, | |
| ): | |
| ''' | |
| Computes the linkage disequilibrium between two SNPs. Computes the r-squared, D' and the estimated haplotype frequencies. | |
| It has been developed to produce the same values as plink with --ld option. (http://pngu.mgh.harvard.edu/~purcell/plink/ld.shtml) | |
| Returns a dictionary with the following keys: | |
| "haplotypes" : A list of tuples, one per haplotype, each with three values: (1) the haplotype, (2) the estimated haplotype frequency and (3) the expected frequency under linkage equilibrium. | |
| "R_sq" : The r-square. | |
| "Dprime" : The D prime. | |
| ''' | |
| def Minor_allele_frequency( | |
| allele1=None, | |
| allele2=None, | |
| observations_chr1 = None, | |
| observations_chr2 = None | |
| ): | |
| """ | |
| Calculates the Minor Allele Frequency of a SNP. | |
| >>> Minor_allele_frequency("A", "C", ["A", "A", "C", "C", "C"], ["A", "C", "C", "C","C"]) | |
| (0.3, 'A', 10) | |
| """ | |
| allele1_chr1 = observations_chr1.count(allele1) | |
| allele2_chr1 = observations_chr1.count(allele2) | |
| if observations_chr2: | |
| allele1_chr2 = observations_chr2.count(allele1) | |
| allele2_chr2 = observations_chr2.count(allele2) | |
| else: | |
| allele1_chr2 = 0 | |
| allele2_chr2 = 0 | |
| observations = allele1_chr1 + allele2_chr1 + allele1_chr2 + allele2_chr2 | |
| allele1_maf = float((allele1_chr1 + allele1_chr2)) / observations | |
| if allele1_maf < 0.5: | |
| return (allele1_maf, allele1, observations) | |
| else: | |
| return (1.0 - allele1_maf, allele2, observations) | |
| ret = {} | |
| (SNP_1_allele_1, SNP_1_allele_2) = SNP_alleles(SNP_1) | |
| (SNP_2_allele_1, SNP_2_allele_2) = SNP_alleles(SNP_2) | |
| Haplotype_1_1 = [x[0] for x in SNP_1] | |
| Haplotype_1_2 = [x[1] for x in SNP_1] | |
| Haplotype_2_1 = [x[0] for x in SNP_2] | |
| Haplotype_2_2 = [x[1] for x in SNP_2] | |
| MAF_1 = Minor_allele_frequency(SNP_1_allele_1, SNP_1_allele_2, Haplotype_1_1, Haplotype_1_2) | |
| MAF_2 = Minor_allele_frequency(SNP_2_allele_1, SNP_2_allele_2, Haplotype_2_1, Haplotype_2_2) | |
| if MAF_1[1] == SNP_1_allele_1: | |
| freq_1_1 = MAF_1[0] | |
| freq_1_2 = 1.0 - freq_1_1 | |
| elif MAF_1[1] == SNP_1_allele_2: | |
| freq_1_2 = MAF_1[0] | |
| freq_1_1 = 1.0 - freq_1_2 | |
| if MAF_2[1] == SNP_2_allele_1: | |
| freq_2_1 = MAF_2[0] | |
| freq_2_2 = 1.0 - freq_2_1 | |
| elif MAF_2[1] == SNP_2_allele_2: | |
| freq_2_2 = MAF_2[0] | |
| freq_2_1 = 1.0 - freq_2_2 | |
| freq_A_B = freq_1_1 * freq_2_1 | |
| freq_A_b = freq_1_1 * freq_2_2 | |
| freq_a_B = freq_1_2 * freq_2_1 | |
| freq_a_b = freq_1_2 * freq_2_2 | |
| Haps = list(zip(Haplotype_1_1, Haplotype_1_2, Haplotype_2_1, Haplotype_2_2)) | |
| N11 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1]) | |
| N21 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1]) | |
| N31 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and h21 == SNP_2_allele_1 and h22 == SNP_2_allele_1]) | |
| N12 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))]) | |
| N22 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))]) | |
| N32 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and ((h21 == SNP_2_allele_1 and h22 == SNP_2_allele_2) or (h21 == SNP_2_allele_2 and h22 == SNP_2_allele_1))]) | |
| N13 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_1 and h12 == SNP_1_allele_1 and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2]) | |
| N23 = len([1 for h11, h12, h21, h22 in Haps if ((h11 == SNP_1_allele_1 and h12 == SNP_1_allele_2) or (h11 == SNP_1_allele_2 and h12 == SNP_1_allele_1)) and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2]) | |
| N33 = len([1 for h11, h12, h21, h22 in Haps if h11 == SNP_1_allele_2 and h12 == SNP_1_allele_2 and h21 == SNP_2_allele_2 and h22 == SNP_2_allele_2]) | |
| N1_ = N11 + N12 + N13 | |
| N2_ = N21 + N22 + N23 | |
| N3_ = N31 + N32 + N33 | |
| N_1 = N11 + N21 + N31 | |
| N_2 = N12 + N22 + N32 | |
| N_3 = N13 + N23 + N33 | |
| N = N1_ + N2_ + N3_ | |
| pAB = 0.25 | |
| pab = 0.25 | |
| pAb = 0.25 | |
| paB = 0.25 | |
| for nn in range(10): # 10 should be enough | |
| nAB = (2*N11) + N12 + N21 + (( (pAB*pab) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| nab = (2*N33) + N23 + N32 + (( (pAB*pab) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| nAb = (2*N13) + N12 + N23 + (( (pAb*paB) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| naB = (2*N31) + N21 + N32 + (( (pAb*paB) / ((pAb*paB) + (pAB*pab)) ) * N22) | |
| pAB = nAB / (2.0 * N) | |
| pab = nab / (2.0 * N) | |
| pAb = nAb / (2.0 * N) | |
| paB = naB / (2.0 * N) | |
| ret["haplotypes"] = [ | |
| (SNP_1_allele_1 + SNP_2_allele_1, pAB, freq_A_B), | |
| (SNP_1_allele_1 + SNP_2_allele_2, pAb, freq_A_b), | |
| (SNP_1_allele_2 + SNP_2_allele_1, paB, freq_a_B), | |
| (SNP_1_allele_2 + SNP_2_allele_2, pab, freq_a_b), | |
| ] | |
| Observed = [pAB, pAb, paB, pab] | |
| Expected = [freq_A_B, freq_A_b, freq_a_B, freq_a_b] | |
| R_sq = sum([((o-e)*(o-e))/e for o,e in zip(Observed, Expected)]) | |
| ret["R_sq"] = R_sq | |
| pA = pAB + pAb | |
| pB = pAB + paB | |
| D = pAB - (pA*pB) | |
| if D>=0: | |
| Dmax = min(pA * (1-pB), (1-pA)*pB) | |
| else: | |
| Dmax = min(pA * pB, (1-pA)*(1-pB)) | |
| Dprime = D / Dmax | |
| ret["Dprime"] = Dprime | |
| return ret | |
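| # Hedged usage sketch for the LD computation above, with made-up genotypes for two SNPs | |
| # typed on the same five samples: | |
| # | |
| #   SNP_1 = [('A', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'G'), ('A', 'A')] | |
| #   SNP_2 = [('C', 'C'), ('C', 'T'), ('T', 'T'), ('C', 'T'), ('C', 'C')] | |
| #   ld_result = Pairwise_linkage_disequilibrium(SNP_1, SNP_2) | |
| #   print(ld_result['R_sq'], ld_result['Dprime'], ld_result['haplotypes']) | |
| # | |
| # The haplotype frequencies are estimated with a small, fixed number of EM iterations | |
| # (the range(10) loop above), mirroring what plink reports with --ld. | |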
| def find_snp_index_from_data(snp_id, data): | |
| # THIS IS SLOW!!! | |
| # We can improve it! | |
| for index, d in enumerate(data): # search the data that was passed in | |
| if d['s_id'] == snp_id: | |
| return index | |
| return None | |
| def ld(cases, controls, snp_1, snp_2, output): | |
| print ('SNP 1:', snp_1) | |
| print ('SNP 2:', snp_2) | |
| output_filename = output + '.ld' | |
| print ('Saving LD to:', output_filename) | |
| snp_1_index = find_snp_index_from_data(snp_1, cases) | |
| snp_2_index = find_snp_index_from_data(snp_2, cases) | |
| snp_1_data = cases[snp_1_index]['g'] + controls[snp_1_index]['g'] | |
| snp_2_data = cases[snp_2_index]['g'] + controls[snp_2_index]['g'] | |
| ld_result = Pairwise_linkage_disequilibrium(snp_1_data, snp_2_data) | |
| with open(output_filename, 'w') as f: | |
| to_print = [snp_1, snp_2, | |
| ld_result['R_sq'], | |
| ld_result['Dprime'], | |
| ] + ["{}:{},{}".format(x[0], x[1], x[2]) for x in ld_result['haplotypes']] | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| def genetic_association_test(SNPs, phenotypes, tests): | |
| ''' | |
| For testing make sure that this method returns the same values as plink | |
| >>> SNPs = [('A','A'), ('A','G'), ('G','G'), ('G','A'), ('G', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'A'), ('G', 'A'), ('A', 'A'), ('A','A'), ('A','G'), ('G','G'), ('G','A'), ('G', 'A'), ('A', 'G'), ('G', 'G'), ('A', 'A'), ('G', 'A'), ('A', 'A')] | |
| >>> phenotypes = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1] | |
| >>> tests = ['GENO', 'TREND', 'ALLELIC', 'DOM', 'REC'] # All of them | |
| >>> results = genetic_association_test(SNPs, phenotypes, tests) | |
| >>> ' '.join([' '.join(['%0.4f' % results[t]['CHISQ'], '%0.4f' % results[t]['P']]) for t in tests]) | |
| '0.2176 0.6409 0.2198 0.6392 0.5128 0.7738 0.0105 0.9185 0.4945 0.4819' | |
| ''' | |
| #Assert that phenotypes and genotypes have the same length | |
| assert len(SNPs) == len(phenotypes) | |
| #Get the two different alleles of this SNP | |
| allele_1, allele_2 = SNP_alleles(SNPs) | |
| #Genotype Contingency table: | |
| n = np.zeros((2 + 1, 3 + 1)) | |
| #Allelic Contingency table: | |
| m = np.zeros((2 + 1, 2 + 1)) | |
| #Expected values for Genotype Contingency table: | |
| En = np.zeros((2 + 1, 3 + 1)) | |
| #Expected values for Allelic Contingency table | |
| Em = np.zeros((2 + 1, 2 + 1)) | |
| #Total values for Genotype Contingency table | |
| nS3 = np.zeros(3 + 1) | |
| n2S = np.zeros(2 + 1) | |
| #Total values for Allelic Contingency table | |
| m2S = np.zeros(2 + 1) | |
| mS2 = np.zeros(2 + 1) | |
| samples = float(len(SNPs)) | |
| #Build Contingency tables | |
| for SNP, phenotype in zip(SNPs, phenotypes): | |
| count_allele_1 = SNP.count(allele_1) | |
| count_allele_2 = SNP.count(allele_2) | |
| allele_1_1 = 1 if SNP == (allele_1, allele_1) else 0 | |
| allele_1_2 = 1 if allele_1 in SNP and allele_2 in SNP else 0 | |
| allele_2_2 = 1 if SNP == (allele_2, allele_2) else 0 | |
| mS2[1] += count_allele_1 | |
| mS2[2] += count_allele_2 | |
| m2S[phenotype] += 1 | |
| m[phenotype][1] += count_allele_1 | |
| m[phenotype][2] += count_allele_2 | |
| nS3[1] += allele_1_1 | |
| nS3[2] += allele_1_2 | |
| nS3[3] += allele_2_2 | |
| n2S[phenotype] += 1 | |
| n[phenotype][1] += allele_1_1 | |
| n[phenotype][2] += allele_1_2 | |
| n[phenotype][3] += allele_2_2 | |
| results = {} | |
| cochran_armitage_tests = ['TREND', 'DOM', 'REC'] | |
| for test in tests: | |
| if test == 'ALLELIC': | |
| chi_square = 0.0 | |
| for i in range(1,3): | |
| for j in range(1,4): | |
| En[i][j] = (n2S[i] * nS3[j]) / samples | |
| if En[i][j] > 0.0: | |
| chi_square += ((n[i][j] - En[i][j]) ** 2) / En[i][j] | |
| p_value = stats.chi2.sf(chi_square, 2) # chisqprob was removed in SciPy 1.0; chi2.sf is equivalent | |
| elif test == 'GENO': | |
| chi_square = 0.0 | |
| for i in range(1,3): | |
| for j in range(1,3): | |
| Em[i][j] = (m2S[i] * mS2[j]) / samples | |
| if Em[i][j] > 0.0: | |
| chi_square += ((m[i][j] - Em[i][j]) ** 2) / Em[i][j] | |
| p_value = stats.chi2.sf(chi_square, 1) | |
| elif test in cochran_armitage_tests: | |
| w = [[0,0,1,2], [0,0,1,1], [0,0,0,1]][cochran_armitage_tests.index(test)] | |
| # Numerator | |
| nom = 0.0 | |
| for i in range(1,4): | |
| nom += w[i] * (n[1][i] * n2S[2] - n[2][i] * n2S[1]) | |
| nom = nom ** 2 | |
| #First part of denominator | |
| den1 = 0.0 | |
| for i in range(1,4): | |
| den1 += (w[i]**2) * nS3[i] * (samples - nS3[i]) | |
| #Second part of denominator | |
| den2 = 0.0 | |
| for i in range(1,3): | |
| for j in range(i+1,4): | |
| den2 += w[i] * w[j] * nS3[i] * nS3[j] | |
| den2 = 2.0 * den2 | |
| #Third part of denominator | |
| den3 = (n2S[1] * n2S[2]) / samples | |
| #The denominator | |
| den = den3 * (den1 - den2) | |
| chi_square = nom / den | |
| p_value = stats.chi2.sf(chi_square, 1) | |
| else: | |
| raise Exception('Unknown test name: "%s"' % (str(test))) | |
| results[test] = {'CHISQ': chi_square, 'P' : p_value} | |
| return results | |
| def association_test(cases, controls, output, test_names): | |
| output_filename = output + '.association' | |
| print ('\nSaving significance testing to file:', output_filename) | |
| if not test_names: | |
| tests = ['GENO'] | |
| else: | |
| tests = list(set(test_names)) | |
| print ('Performing the following tests: ', ' '.join(tests)) | |
| with open(output_filename, 'w') as f: | |
| header = ['snp_id', 'location'] | |
| for test_name in tests: | |
| header.extend([test_name + '_pvalue', test_name + '_chisquare']) | |
| #Write header | |
| f.write(' '.join(header) + '\n') | |
| bar = progressbar.ProgressBar(max_value=len(cases)+1) | |
| line_counter = 0 | |
| for case, control in zip(cases, controls): | |
| line_counter += 1 | |
| if line_counter % 1000 == 0: | |
| bar.update(line_counter) | |
| genotypes = case['g'] + control['g'] | |
| phenotypes = [2] * len(case['g']) + [1] * len(control['g']) | |
| test_result = genetic_association_test(genotypes, phenotypes, tests) | |
| to_print = [case['s_id'], case['l']] | |
| for test_name in tests: | |
| to_print.extend([test_result[test_name]['P'], test_result[test_name]['CHISQ']]) | |
| f.write(' '.join(map(str, to_print)) + '\n') | |
| def plot_manhattan(location, p_value, output_filename, title='Manhattan plot'): | |
| plt.plot(location, -np.log(p_value), '.', markersize=5, color='black') | |
| plt.xlabel('Location') | |
| plt.ylabel('-log(p)') | |
| plt.title(title) | |
| #Plot significance threshold as a red line | |
| plt.plot([location[0], location[-1]], [6, 6], color='red') | |
| plt.xlim(location[0], location[-1]) | |
| png = output_filename + '.png' | |
| print ('\nSaving to:', png) | |
| plt.savefig(png) | |
| plt.show() | |
| def ppoints(n): | |
| ''' | |
| Same as R's ppoints | |
| http://stat.ethz.ch/R-manual/R-patched/library/stats/html/ppoints.html | |
| Also check: http://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Heuristics | |
| ''' | |
| if n <= 10: | |
| a = 3.0/8.0 | |
| else: | |
| a = 0.5 | |
| return (np.array(range(1,n+1)) - a) / (n + 1 - 2.0*a) | |
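| # Worked example for the helper above; the values agree with R's ppoints(5): | |
| # | |
| #   ppoints(5)  ->  array([0.11904762, 0.30952381, 0.5, 0.69047619, 0.88095238]) | |
| # | |
| # i.e. (i - 3/8) / (n + 1/4) for i = 1..5, the plotting positions used for the expected | |
| # quantiles in the QQ plot below. | |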
| def plot_qqplot(p_values, output_filename, title='QQ plot'): | |
| ''' | |
| Create a qq-plot for p values from significance testing. | |
| ''' | |
| observed = np.sort(p_values) | |
| observed = -np.log10(observed) | |
| expected = -np.log10(ppoints(len(observed))) | |
| x_label = 'expected -log10(p)' | |
| y_label = 'observed -log10(p)' | |
| l = np.median(observed)/-np.log10(0.5) | |
| xmax = max(expected[np.invert(np.isinf(expected))]) + 1 | |
| ymax = max(observed[np.invert(np.isinf(observed))]) + 1 | |
| xymax = max(xmax, ymax) | |
| fig, ax = plt.subplots() | |
| #Set labels | |
| ax.set_xlabel(x_label) | |
| ax.set_ylabel(y_label) | |
| ax.set_title(title) | |
| #plot ab line | |
| ax.plot([-0.2, xymax], [-0.2, xymax], color='red', linewidth=2.0) | |
| #Plot qq plot | |
| ax.plot(expected, observed, '.') | |
| png = output_filename + '.png' | |
| print ('\nSaving plot to:', png) | |
| plt.savefig(png) | |
| plt.show() | |
| def read_pvalues(filename, test_name): | |
| locations = [] | |
| pvalues = [] | |
| with open(filename) as f: | |
| header = f.readline().replace('\n', '').split() | |
| p_column_name = test_name + '_pvalue' | |
| if not p_column_name in header: | |
| raise Exception('Column {} does not exist in file: {}'.format(p_column_name, filename)) | |
| p_column_index = header.index(p_column_name) | |
| location_index = header.index('location') | |
| bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength) | |
| line_counter = 0 | |
| for line in f: | |
| line_counter += 1 | |
| if line_counter % 10000 == 0: | |
| bar.update(line_counter) | |
| line_splitted = line.replace('\n', '').split() | |
| location = int(line_splitted[location_index]) | |
| p_value = float(line_splitted[p_column_index]) | |
| locations.append(location) | |
| pvalues.append(p_value) | |
| return locations, pvalues | |
| def manhattan(options, output): | |
| manhattan_filename = options[0] | |
| manhattan_test = options[1] | |
| locations, pvalues = read_pvalues(manhattan_filename, manhattan_test) | |
| plot_manhattan(locations, pvalues, output + '.'+ manhattan_test + '.manhattan', | |
| title='{} Manhattan plot'.format(manhattan_test)) | |
| def qqplot(options, output): | |
| qq_filename = options[0] | |
| qq_test = options[1] | |
| locations, pvalues = read_pvalues(qq_filename, qq_test) | |
| plot_qqplot(pvalues, | |
| output_filename = output + '.' + qq_test + '.qqplot', | |
| title = '{} QQ plot'.format(qq_test)) | |
| def parse_vep_response(response): | |
| ret = {} | |
| for r in response: | |
| if 'transcript_consequences' in r: | |
| for transcript_consequence in r['transcript_consequences']: | |
| transcript_id = transcript_consequence['transcript_id'] | |
| ret[transcript_id] = {} | |
| if 'polyphen_prediction' in transcript_consequence: | |
| ret[transcript_id]['polyphen_prediction'] = transcript_consequence['polyphen_prediction'] | |
| if 'sift_prediction' in transcript_consequence: | |
| ret[transcript_id]['sift_prediction'] = transcript_consequence['sift_prediction'] | |
| return ret | |
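| # Hedged sketch of the structure this parser expects: a VEP response is a list of records, | |
| # each possibly carrying a 'transcript_consequences' list; only the transcript id and any | |
| # PolyPhen / SIFT predictions are kept. Made-up example: | |
| # | |
| #   response = [{'transcript_consequences': [ | |
| #       {'transcript_id': 'ENST00000000001', 'polyphen_prediction': 'benign', | |
| #        'sift_prediction': 'tolerated'}]}] | |
| #   parse_vep_response(response) | |
| #   # -> {'ENST00000000001': {'polyphen_prediction': 'benign', 'sift_prediction': 'tolerated'}} | |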
| def hgvs_formatter(chromosome, location, reference, alternative): | |
| return "{}:g.{}{}>{}".format(chromosome, location, reference, alternative) | |
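| # For example (made-up coordinates), hgvs_formatter('20', 1234567, 'A', 'G') returns | |
| # "20:g.1234567A>G", the genomic HGVS notation accepted by the Ensembl VEP hgvs endpoint | |
| # used below. | |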
| def get_info_hgvs(hgvs_l): | |
| ''' | |
| POST request | |
| ''' | |
| url = 'http://rest.ensembl.org/vep/human/hgvs' | |
| headers = { "Content-Type" : "application/json", "Accept" : "application/json"} | |
| data = { "hgvs_notations" : hgvs_l} | |
| data_json = json.dumps(data) | |
| r = requests.post(url, headers=headers, data=data_json) | |
| if not r.ok: | |
| return None | |
| data = r.json() | |
| return parse_vep_response(data) | |
| def get_info_rs(rs_variant_l): | |
| ''' | |
| POST request | |
| ''' | |
| url = 'http://rest.ensembl.org/vep/human/id' | |
| headers = { "Content-Type" : "application/json", "Accept" : "application/json"} | |
| data = { "ids" : rs_variant_l } | |
| data_json = json.dumps(data) | |
| r = requests.post(url, headers=headers, data=data_json) | |
| if not r.ok: | |
| return None | |
| data = r.json() | |
| #print (data) | |
| return parse_vep_response(data) | |
| def print_get_info_results(get_info_results): | |
| ''' | |
| Nicely print get_info results | |
| ''' | |
| print (json.dumps(get_info_results, indent=4)) | |
| def is_rs(variant): | |
| return re.match(r'rs[\d]+', variant) | |
| def get_info(cases, controls, variants_index): | |
| ret = {} | |
| total = len(variants_index) | |
| bar = progressbar.ProgressBar(max_value=len(variants_index)) | |
| for variant_counter, variant_index in enumerate(variants_index): | |
| bar.update(variant_counter) | |
| #Check which variants are rs and which are hgvs | |
| rs_id = cases[variant_index]['r_id'] | |
| #print ('Running get_info for: {} {}/{}'.format(rs_id, variant_counter+1, total)) | |
| if is_rs(rs_id): | |
| get_info_results = get_info_rs([rs_id]) | |
| else: | |
| s = re.match(r'([\d]+)-([\d]+)', rs_id) | |
| if not s: | |
| raise Exception('Could not parse rs_id: {}'.format(rs_id)) | |
| chromosome = s.group(1) | |
| location = int(s.group(2)) | |
| liftovered_location = do_liftover(chromosome, location) | |
| reference = cases[variant_index]['r'] | |
| alternative = cases[variant_index]['a'] | |
| this_hgvs = hgvs_formatter(chromosome, liftovered_location, reference, alternative) | |
| get_info_results = get_info_hgvs([this_hgvs]) | |
| #Check returned data | |
| if get_info_results == {}: | |
| print ('WARNING!!!! get_info did not return consequences information for {}'.format(rs_id)) | |
| elif get_info_results is None: | |
| print ('WARNING!!!! get_info failed for variant: {}'.format(rs_id)) | |
| else: | |
| ret[variant_index] = get_info_results | |
| #print_get_info_results(results) | |
| return ret | |
| def do_liftover(chromosome, position): | |
| if not lo: | |
| return position | |
| lo_result = lo.convert_coordinate('chr{}'.format(chromosome), position) | |
| if not lo_result: | |
| print ('WARNING!! Could not liftover: {}:{}'.format(chromosome, position)) | |
| return None | |
| if len(lo_result) > 1: | |
| # Is this possible??? | |
| print ('WARNING! Liftover returned more than one position for {}:{}'.format(chromosome, position)) | |
| return lo_result[0][1] | |
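| # Note on the pyliftover call above: LiftOver.convert_coordinate() returns a list of | |
| # (chromosome, position, strand, score) tuples, or an empty list / None when the position | |
| # cannot be mapped, which is why the single converted position is read as lo_result[0][1]. | |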
| def setup_liftover(from_assembly, to_assembly): | |
| global lo | |
| print ('Setting up liftover..') | |
| lo = LiftOver(from_assembly, to_assembly) | |
| print ('...Done') | |
| def plot_distribution(data, bins=50, xlabel='', ylabel='', title='', output_filename=None): | |
| ''' | |
| http://matplotlib.org/1.2.1/examples/pylab_examples/histogram_demo.html | |
| ''' | |
| n, bins, patches = plt.hist(data, bins) | |
| plt.xlabel(xlabel) | |
| plt.ylabel(ylabel) | |
| plt.title(title) | |
| if output_filename: | |
| png = output_filename + '.png' | |
| print ('Saving to ', png) | |
| plt.savefig(png) | |
| plt.show() | |
| def maf_from_frequency_file(filename): | |
| frequencies = [] | |
| print ('Reading file:', filename) | |
| with open(filename) as f: | |
| for line in f: | |
| line_splitted = line.replace('\n', '').split() | |
| fr_ref = float(line_splitted[-2]) | |
| fr_alt = float(line_splitted[-1]) | |
| maf = min(fr_ref, fr_alt) | |
| frequencies.append(maf) | |
| return frequencies | |
| def p_from_hwe_file(filename): | |
| print ('Reading file:', filename) | |
| all_p = [] | |
| with open(filename) as f: | |
| for line in f: | |
| line_splitted = line.replace('\n', '').split() | |
| p = float(line_splitted[-1]) | |
| all_p.append(p) | |
| return all_p | |
| def allele_frequency_plot(filename): | |
| ''' | |
| Plots the distribution of minor allele frequencies read from an allele frequency file | |
| ''' | |
| frequencies = maf_from_frequency_file(filename) | |
| plot_distribution(frequencies, | |
| bins=100, | |
| xlabel='Minor Allele Frequencies', | |
| ylabel='Counts', | |
| output_filename = filename) | |
| def hwe_plot(filename): | |
| all_p = p_from_hwe_file(filename) | |
| plot_distribution(all_p, | |
| bins=100, | |
| xlabel='HWE p-values', | |
| ylabel='Counts', | |
| output_filename=filename) | |
| def maf_filter(cases, controls, output, maf_filter): | |
| print ('\nFiltering with MAF>{}'.format(maf_filter)) | |
| print ('Calculate frequencies..') | |
| frequencies_filename = allele_frequency(cases, controls, output) | |
| print ('Read file with frequencies..') | |
| frequencies = maf_from_frequency_file(frequencies_filename) | |
| print ('Apply filter..') | |
| removed = 0 | |
| total = 0 | |
| with open('remove_snps.tmp', 'w') as f: | |
| for case, frequency in zip(cases, frequencies): | |
| total += 1 | |
| if frequency<maf_filter: | |
| removed += 1 | |
| f.write(case['s_id'] + '\n') | |
| print ('Removed: {} out from {} SNPs'.format(removed, total)) | |
| print ('Create new data..') | |
| remove_snps(cases, controls, output, 'remove_snps.tmp') | |
| def hwe_filter(cases, controls, output, hwe_filter, test): | |
| print ('\nFiltering with HWE>{}'.format(hwe_filter)) | |
| print ('Calculate hwe..') | |
| hwe_filename = hwe(cases, controls, output, test) | |
| print ('\nReading file with p-values: {}'.format(hwe_filename)) | |
| all_p = p_from_hwe_file(hwe_filename) | |
| print ('Apply filter..') | |
| removed = 0 | |
| total = 0 | |
| with open('remove_snps.tmp', 'w') as f: | |
| for case, p in zip(cases, all_p): | |
| total += 1 | |
| if p < hwe_filter: | |
| removed += 1 | |
| f.write(case['s_id'] + '\n') | |
| print ('Removed: {} out from {} SNPs'.format(removed, total)) | |
| print ('Create new data..') | |
| remove_snps(cases, controls, output, 'remove_snps.tmp') | |
| def p_value_generator(filename, test_name): | |
| ''' | |
| This is a generator: https://wiki.python.org/moin/Generators | |
| ''' | |
| print ('\nOpening file:', filename) | |
| with open(filename) as f: | |
| header = f.readline().replace('\n', '').split() | |
| p_column_name = test_name + '_pvalue' | |
| if not p_column_name in header: | |
| raise Exception('{} column does not exist in {}'.format(p_column_name, filename)) | |
| p_column_index = header.index(p_column_name) | |
| line_counter = 0 | |
| for line in f: | |
| line_counter += 1 | |
| if line_counter % 10000 == 0: | |
| pass | |
| #print ('Line counter: ', line_counter) | |
| line_splitted = line.replace('\n', '').split() | |
| p_value = float(line_splitted[p_column_index]) | |
| #yield line_counter, p_value | |
| yield p_value | |
| def find_peaks(data, window_size, significance_level): | |
| peaks = [] | |
| for i in range(0, len(data), window_size): | |
| window = -np.log(data[i : i+window_size]) | |
| window_index = i + int(np.argmax(window)) # The int() part is to get over with this issue: https://bugs.python.org/issue24313 | |
| # Yep.. even python has its quirks.. | |
| window_max = np.max(window) | |
| if window_max < significance_level: | |
| continue # This is not a peak | |
| #There is an interesting peak. # Keep it! | |
| peaks.append({'index': window_index, 'max': window_max}) | |
| return peaks | |
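| # Note on the peak detection above: p-values are transformed with the natural logarithm | |
| # (np.log, not log10), so the significance_level of 8 used by independent() below | |
| # corresponds to p < exp(-8), roughly 3.4e-4. Hedged, made-up example: | |
| # | |
| #   find_peaks([0.5, 1e-5, 0.3, 0.9], window_size=2, significance_level=8) | |
| #   # -> [{'index': 1, 'max': 11.5129...}]   (only the first window clears the threshold) | |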
| def get_data_point(cases, controls, index): | |
| return cases[index]['g'] + controls[index]['g'] | |
| def disequilibrium_criterion(ld_result, dprime_threshold=0.98, r_sq_threshold=0.2): | |
| ''' | |
| http://www.nature.com/nrg/journal/v4/n8/full/nrg1123.html | |
| ''' | |
| if np.abs(ld_result['Dprime']) > dprime_threshold: | |
| return True | |
| if ld_result['R_sq'] > r_sq_threshold: | |
| return True | |
| return False | |
| def get_independent_peaks(cases, controls, peaks): | |
| iterations = 0 | |
| while True: | |
| iterations += 1 | |
| new_peaks = [] | |
| change_happened = False | |
| for peak_index in range(0, len(peaks), 2): | |
| peak_1 = peaks[peak_index] | |
| if peak_index+1 == len(peaks): | |
| new_peaks.append(peak_1) # odd number of peaks: keep the unpaired last one | |
| continue | |
| peak_2 = peaks[peak_index+1] | |
| distance = peak_2['index'] - peak_1['index'] | |
| genotypes_1 = get_data_point(cases, controls, peak_1['index']) | |
| genotypes_2 = get_data_point(cases, controls, peak_2['index']) | |
| ld_result = Pairwise_linkage_disequilibrium(genotypes_1, genotypes_2) | |
| are_in_LD = disequilibrium_criterion(ld_result) | |
| if are_in_LD: | |
| new_peaks.append(peak_1) | |
| change_happened = True | |
| else: | |
| new_peaks.append(peak_1) | |
| new_peaks.append(peak_2) | |
| print ('Iteration: {}. Peaks: {}'.format(iterations, len(new_peaks))) | |
| peaks = new_peaks | |
| if not change_happened: | |
| # Our work here is done | |
| break | |
| return peaks | |
| def regional_ld_snps(cases, controls, index): | |
| ret = [] | |
| peak_genotypes = get_data_point(cases, controls, index) | |
| for direction in [-1, +1]: | |
| #print ('Direction: {}'.format({-1:'-', 1:'+'}[direction])) | |
| compare_with = index + direction | |
| last_snp_in_ld = index | |
| while True: | |
| compare_with_genotype = get_data_point(cases, controls, compare_with) | |
| ld_result = Pairwise_linkage_disequilibrium(peak_genotypes, compare_with_genotype) | |
| if disequilibrium_criterion(ld_result): | |
| #print ('{} - {} LD!'.format(peak_counter, compare_with)) | |
| ret.append(compare_with) | |
| last_snp_in_ld = compare_with | |
| else: | |
| #What is the distance between this and the last distance in LD? | |
| distance = np.abs(compare_with - last_snp_in_ld) | |
| #print ('{} - {} NOT LD. DISTANCE: {}'.format(peak_counter, compare_with, distance)) | |
| if distance > 100: # 1000 returned the same results. | |
| break | |
| compare_with += direction # move along | |
| return ret | |
| def independent(cases, controls, output, independent_arg): | |
| ''' | |
| This function takes as input the result of a statistical test analysis | |
| and locates the independent signals | |
| ''' | |
| filename = independent_arg[0] | |
| test_name = independent_arg[1] | |
| p_values = list(p_value_generator(filename, test_name)) | |
| peaks = find_peaks(p_values, 100, 8) | |
| print ('Found {} potential interesting significance peaks!'.format(len(peaks))) | |
| ind_peaks = get_independent_peaks(cases, controls, peaks) | |
| print ('Independent peaks: {}'.format(len(ind_peaks))) | |
| peak_lds = {} | |
| for peak_counter, peak in enumerate(ind_peaks): | |
| print ('Investigating peak: {}'.format(peak_counter+1)) | |
| peak_index = peak['index'] | |
| peak_lds[peak_index] = regional_ld_snps(cases, controls, peak_index) | |
| print ('Peak {}. Found {} SNPs in LD'.format(peak_counter, len(peak_lds[peak_index]))) | |
| output_filename = output + '.independent' | |
| print ('Saving results to {}'.format(output_filename)) | |
| with open(output_filename, 'w') as f: | |
| json.dump(peak_lds, f, indent=4) | |
| def get_info_independent(cases, controls, output, input_filename): | |
| print ('Reading:', input_filename) | |
| to_save = {} | |
| with open(input_filename) as f: | |
| peak_lds = json.load(f) | |
| peak_lds = {int(k):v for k,v in peak_lds.items()} # JSON stores keys as strings.. | |
| for peak_counter, peak in enumerate(peak_lds): | |
| print ('\nGetting info for peak: {}/{}'.format(peak_counter+1, len(peak_lds))) | |
| peak_id = cases[peak]['r_id'] | |
| peak_location = cases[peak]['l'] | |
| peak_info = get_info(cases, controls, peak_lds[peak]) | |
| to_save[peak_id] = { | |
| 'location': peak_location, | |
| 'info': peak_info, | |
| } | |
| output_filename = output + '.get_info' | |
| print ('\nSaving results to:', output_filename) | |
| with open(output_filename, 'w') as f: | |
| json.dump(to_save, f, indent=4) | |
| if __name__ == '__main__': | |
| ''' | |
| Example runs: | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --keep_snps example.keep.snps | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --remove_snps example.keep.snp | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --keep_samples example.keep.samples | |
| /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --remove_samples example.keep.samples | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --allele_frequency | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --allele_frequency_plot output.frequency | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_maf005 --maf_filter 0.05 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --hwe_asymptotic | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --hwe_exact | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --hwe_plot output_asymptotic.hwe_asymptotic | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_hwe_asymptotic_0001 --hwe_asymptotic_filter 0.05 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output_hwe_exact_0001 --hwe_exact_filter 0.05 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --ld snp_5 snp_7 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --association_test | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --manhattan output.association GENO | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --qqplot output.association GENO | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --liftover hg18 hg38 --get_info rs4814683 | |
| time /Users/alexandroskanterakis/anaconda3/bin/python project.py --controls_file gwas.controls.gen --cases_file gwas.cases.gen --output output --liftover hg18 hg38 --get_info snp_1 | |
| ''' | |
| argparser = argparse.ArgumentParser(description='Python project https://gist.github.com/kantale/24f4f02d5b87ce0146e0455791e71aeb ') | |
| argparser.add_argument('--controls_file', default=None, required = False) | |
| argparser.add_argument('--cases_file', default=None, required=False) | |
| argparser.add_argument('--output') | |
| argparser.add_argument('--keep_snps', default=None, required = False) | |
| argparser.add_argument('--remove_snps', default=None, required = False) | |
| argparser.add_argument('--keep_samples', default=None, required=False) | |
| argparser.add_argument('--remove_samples', default=None, required=False) | |
| argparser.add_argument('--allele_frequency', action='store_true', required=False) | |
| argparser.add_argument('--allele_frequency_plot', required=False) | |
| argparser.add_argument('--maf_filter', default=None, type=float, required=False) | |
| argparser.add_argument('--hwe_asymptotic', action='store_true', required=False) | |
| argparser.add_argument('--hwe_exact', action='store_true', required=False) | |
| argparser.add_argument('--hwe_plot', default=None, required=False) | |
| argparser.add_argument('--hwe_asymptotic_filter', default=None, type=float, required=False) | |
| argparser.add_argument('--hwe_exact_filter', default=None, type=float, required=False) | |
| argparser.add_argument('--ld', nargs=2, required=False) | |
| argparser.add_argument('--association_test', nargs='*', required=False) | |
| argparser.add_argument('--manhattan', nargs=2, required=False) | |
| argparser.add_argument('--qqplot', nargs=2, required=False) | |
| argparser.add_argument('--get_info', default=None, required=False) | |
| argparser.add_argument('--liftover', nargs=2, required=False) | |
| argparser.add_argument('--independent', nargs=2, default=None, required=False) | |
| argparser.add_argument('--get_info_independent', default=None, required=False) | |
| options = argparser.parse_args() | |
| cases = None | |
| controls = None | |
| output = options.output | |
| if not options.controls_file is None and not options.cases_file is None: | |
| cases, controls = read_input(options) | |
| #Setup liftover object | |
| if options.liftover: | |
| print ('\nSetting up liftover object from:{} to:{}'.format(*options.liftover)) | |
| setup_liftover(options.liftover[0], options.liftover[1]) | |
| if not options.keep_snps is None: | |
| keep_snps(cases, controls, output, options.keep_snps) | |
| if not options.remove_snps is None: | |
| remove_snps(cases, controls, output, options.remove_snps) | |
| if not options.keep_samples is None: | |
| keep_samples(cases, controls, output, options.keep_samples) | |
| if not options.remove_samples is None: | |
| remove_samples(cases, controls, output, options.remove_samples) | |
| if options.allele_frequency: | |
| allele_frequency(cases, controls, output) | |
| if options.allele_frequency_plot: | |
| allele_frequency_plot(options.allele_frequency_plot) | |
| if not options.maf_filter is None: | |
| maf_filter(cases, controls, output, options.maf_filter) | |
| if options.hwe_asymptotic: | |
| hwe(cases, controls, output, 'asymptotic') | |
| if options.hwe_exact: | |
| hwe(cases, controls, output, 'exact') | |
| if options.hwe_plot: | |
| hwe_plot(options.hwe_plot) | |
| if not options.hwe_asymptotic_filter is None: | |
| hwe_filter(cases, controls, output, options.hwe_asymptotic_filter, 'asymptotic') | |
| if not options.hwe_exact_filter is None: | |
| hwe_filter(cases, controls, output, options.hwe_exact_filter, 'exact') | |
| if options.ld: | |
| ld(cases, controls, options.ld[0], options.ld[1], output) | |
| if not options.association_test is None: | |
| association_test(cases, controls, output, options.association_test) | |
| if not options.manhattan is None: | |
| manhattan(options.manhattan, output) | |
| if not options.qqplot is None: | |
| qqplot(options.qqplot, output) | |
| if not options.get_info is None: | |
| get_info(cases, controls, [options.get_info]) | |
| if not options.independent is None: | |
| independent(cases, controls, output, options.independent) | |
| if not options.get_info_independent is None: | |
| get_info_independent(cases, controls, output, options.get_info_independent) | |