Created
September 4, 2018 05:48
-
-
Save cgpu/70acbc09e1d9fe64cc1bdccdd0b72b39 to your computer and use it in GitHub Desktop.
Analysis pipeline for filtering of SNPs. Not DRY or fast. For archiving purposes and future revisit.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# TO RUN JUST TYPE: | |
# python3 snp_selection.py | |
# INPUT FILES AND OPTIONS WILL BE REQUESTED AS USER INPUT | |
# THE FOLLOWING PROGRAMME IS PART OF the PACtool package (https://pypi.python.org/pypi/PACtool) | |
# OUTPUT FILES FROM PACtool GWAS ANALYSIS CAN BE ANALYZED | |
# FOR THE SELECTION OF STATISTICALLY SIGNIFICANT SNPs. | |
# OPTIONS TO PERFORM EITHER COMBINATION OF THE FOLLOWING TESTS ARE AVAILABLE: | |
# Minor Allele Frequency Screening; only keep SNPs with MAF > 0.05 | |
# HWE; only keep samples with Hardy Weiberg Expectation p-value > 0.001 | |
# Association Test; option to select p-value threshold (p < 0.000001 recommended ) | |
# INPUT FILES SHOULD BE THE PACtool OUTPUT FILES | |
# WITH THE FOLLOWING FILE EXTENSION SUFFIXES: | |
# | |
# .frequency | |
# .hwe | |
# .association | |
# | |
# FOR MORE DETAILS TYPE: | |
# python pactool.py -h | |
import csv # To store output of screening in a file | |
import timeit # For checking runtime in secs | |
import math # Just to round up runtime float, with math.ceil(runtime) | |
freqs_start = timeit.default_timer() | |
#Naming the lists that we will use in the end to take the SNPs that satisfy all selected tests' criteria: | |
#Leaving them empty allows for flexibility if a test is not selected. | |
maf_intersection_list = [] | |
hwe_intersection_list = [] | |
assoc_intersection_list = [] | |
########################################################################################################### | |
# START OF MINOR ALLELE FREQUENCY TEST BLOCK: | |
########################################################################################################### | |
MAF_threshold = 0.05 | |
print("Please select tests for your SNP screening:") | |
input_maf = """\nMAF Test: | |
This test will keep only the SNPs with Minor Allele Frequency (MAF) > 0.05. | |
Proceed [y/n]:\n""" | |
maf_test = input(input_maf) | |
if maf_test == "y": | |
maf_file = input("\nEnter output.frequency filename (e.g chr20.frequency) from PACtool analysis:\n\n") | |
try: | |
with open(maf_file) as f: | |
freqs_LIST = [x.split() for x in f] | |
# freqs_LIST: A LIST, which each row of the file as sublist | |
# [i] print(len(freqs_LIST), freqs_LIST[0:2]) | |
# [o] 195146 | |
# [ | |
# ['snp_code', 'ref_freq_control', 'alt_freq_control', 'ref_freq_cases', 'alt_freq_cases', 'ref_freq_total', 'alt_freq_total'], | |
# ['snp_0', '0.802', '0.198', '0.785', '0.215', '0.7935', '0.2065'] | |
# ] | |
freqs_cols = [x for x in zip(*freqs_LIST)] | |
# freqs_cols: A list, with 7 tuples; | |
# 0.snp_code ______freqs_cols[0] ('snp_code', 'snp_0', 'snp_1') | |
# 1.ref_freq_control | |
# 2.alt_freq_control | |
# 3.ref_freq_cases | |
# 4.alt_freq_cases | |
# 5.ref_freq_total ______freqs_cols[5] | |
# 6.alt_freq_total ______freqs_cols[6] | |
#BEFORE KEEPING SNPs WITH MAF > 0.05 | |
#WE HAVE TO FIND WHICH ONE IS THE MAF. | |
#CREATING THE LISTS and COUNTERS NEEDED | |
#TO KEEP MINOR ALLELE FREQUENCY OF EACH SNPs | |
#Read below, the if loop for more details: | |
snp_code_freqs = list(freqs_cols[0][1:]) #freqs_cols[0][0] is the header, so we start from [0][1] | |
ref_freqs = list(freqs_cols[5][1:]) #refers to total observed frequency in the population for the REF allele | |
alt_freqs = list(freqs_cols[6][1:]) #refers to total observed frequency in the population for the ALT allele | |
minor_snps_kept = [] #the very same list as this one: snp_code_freqs = list(freqs_cols[0][1:]) | |
minor_freqs_kept = [] #a list that will hold the value of the minor allele frequency for each snp_code | |
minor_freqs_from = [] #a list, with len= 195145, containing these strings: "ref", "alt" | order stays true to the above lists | |
#thus we could know which is the minor allele for each snp | |
ref_minor_snps = [] #a list that stores the snp_codes of snps with the REF as the minor allele | |
ref_minor_counter = 0 #a counter that counts how many snps have the ref as minor allele | |
#len(ref_minor_snps) == ref_minor_counter == minor_freqs_from.count("ref")| check if True | |
alt_minor_snps = [] #a list that stores the snp_codes of snps with the ALT as the minor allele | |
alt_minor_counter = 0 #a counter that counts how many snps have the ALT as minor allele | |
#len(alt_minor_snps) == alt_minor_counter == minor_freqs_from.count("alt")| check if True | |
# FINDING THE MINOR ALLELE FOR EACH SNP: | |
# x ____ snp_code_freqs | |
# q ____ ref_freqs | |
# z ____ alt_freqs | |
# The loop below compares the values of ref_freq_total and alt_freq_total of a snp, | |
# the value that is smaller of the two, is the minor allele frequency | |
for x,q,z in zip(snp_code_freqs, ref_freqs, alt_freqs ): #order is super important here - be meticulous | |
if float(q) < float(z): #IF aka REF is the minor allele: | |
#THEN: | |
minor_freqs_kept.append(float(q)) # add that value to the list with the minor allele freqs | |
minor_snps_kept.append(x) # add the respective snp_code for that value in the list that holds the snp_codes | yes, redundant list; either redundant or ugly | |
ref_minor_snps.append(x) # add the respective snp_code for that value also in ref_minor_snps | only if minor is ref here | |
minor_freqs_from.append("ref") # add a "ref", in the list minor_freqs_from, for the respective snp | |
ref_minor_counter = ref_minor_counter + 1 # add 1 to the counter so that we know how many SNPs have the REF as minor allele | |
else: #ELSE: aka if float(y) > float(z), aka ALT is the minor allele: | |
minor_freqs_kept.append(float(z)) #THEN: | |
minor_snps_kept.append(x) # add that value to the list with the minor allele freqs | |
alt_minor_snps.append(x) # add the respective snp_code for that value in the list that holds the snp_codes | |
minor_freqs_from.append("alt") # add an "alt", in the list minor_freqs_from, for the respective snp | |
alt_minor_counter = alt_minor_counter + 1 # add 1 to the counter so that we know how many SNPs have the ALT as minor allele | |
#STORING snp_code, minor_freq, ref/alt for each SNP | |
#as a sublist in a LIST, that holds as many sublists | |
#as the SNPs; 195145 that is. | |
#So, len(kept_LIST)==195145 (check if True) | |
#kept_LIST[i][0] is snp_code | |
#kept_LIST[i][1] is the value of minor allele freq for snp_i | |
#kept_LIST[i][2] is either "ref" or "alt"; denotes which one is the minor for snp_i | |
############################################################################################################################## | |
#COMMENTED OUT BLOCK THAT CREATES A FILE WITH ALL SNPs, EVEN < 0.05 MINOR ALLELE FREQ | |
##minor_LIST = [list(x) for x in zip(freqs_snps_kept, minor_freqs_kept, minor_freqs_from)] | |
## with open("minor_freq_SNPs.txt", "w") as q: #just changing the usual f to q, for avoiding conflict of variable claiming | |
## writer1 = csv.DictWriter(q, fieldnames = ["snp_code", "minor_freq", "minor_allele"], delimiter = ' ') | |
## writer1.writeheader() | |
## writer2 = csv.writer(q, delimiter =' ') | |
## writer2.writerows(minor_LIST) | |
############################################################################################################################### | |
# CREATING THE LISTS and COUNTERS NEEDED | |
# TO ONLY KEEP SNPs with ALLELE FREQUENCY > 0.05 | |
# Read comments above the following if-Loop for more details: | |
final_minor_freqs = [] | |
final_minor_snps = [] | |
final_minor_allele = [] | |
final_minor_counter = 0 | |
final_minor_out = 0 | |
# DISCARDING SNPs with minor allele frequency < 0.05: | |
# w_________freqs_snps_kept | |
# e_________minor_freqs_kept | |
# r_________minor_freqs_from | |
for w,e,r in zip(minor_snps_kept, minor_freqs_kept, minor_freqs_from): | |
if float(e) > 0.05: #(ONLY) IF the minor allele frequency > 0.05 (hint:KEEP SNP) | |
#THEN: | |
final_minor_freqs.append(float(e)) #add that value to the list with the final SNPs' minor allele freqs | |
final_minor_snps.append(w) #add the respective snp_code for that value in the list that holds the snp_codes | |
maf_intersection_list.append(w) #add the respective snp_code in the list for the intersection btween tests | |
final_minor_allele.append(r) #add the "ref" or "alt", to denote the minor allele | |
final_minor_counter = final_minor_counter + 1 #add 1 to the counter so that we know how many SNPs have minor all freq > 0.05 | |
else: | |
final_minor_out = final_minor_out + 1 #add 1 to the counter so that we know how many SNPs have minor all freq < 0.0 | |
#final_minor_counter+final_minor_out == 195145 | check if True | |
######################################################################################################################################### | |
# FEEL FREE TO COMMENT OUT THE FOLLOWING BLOCK THAT STORES SNPs IN A FILE. | |
# IF YOU NEED TO SAVE TIME AND DO NOT NEED THE OUTPUT FILE, | |
# YOUR SNPs WILL BE ONLY STORED IN THE LIST: final_minor_snps | |
# *NOTE* | |
# PREFERABLY COMMENT OUT CODE WITH 2 HASHTAGS, TO DENOTE THAT IT IS NOT COMMENTS BUT "SILENCED" CODE | |
########################################################################################################################################## | |
FILE_minor_start = timeit.default_timer() | |
final_minor_LIST = [list(x) for x in zip(final_minor_snps, final_minor_freqs, final_minor_allele)] | |
with open("maf_screening_snps.txt", "w") as e: | |
writer1 = csv.DictWriter(e, fieldnames = ["snp_code", "minor_freq", "minor_allele"], delimiter = ' ') | |
writer1.writeheader() | |
writer2 = csv.writer(e, delimiter =' ') | |
writer2.writerows(final_minor_LIST) | |
FILE_minor_end = timeit.default_timer() | |
FILE_minor_runtime = FILE_minor_end - FILE_minor_start | |
########################################################################################################################################## | |
freqs_end = timeit.default_timer() | |
freqs_runtime = freqs_end - freqs_start | |
print(">MINOR ALLELE FREQUENCY SCREENING:") | |
print(final_minor_out, "SNPs were discarded from total (", len(snp_code_freqs) ,") in the Minor Allele Frequency screening.") | |
print(len(final_minor_snps),"SNPs with Minor Allele Frequency >", MAF_threshold ,"were kept in the Minor Allele screening.") | |
print("Minor Allele Frequency Screening Runtime:", math.ceil(freqs_runtime), "seconds\n") | |
#print("FILE storing Runtime:", math.ceil(FILE_minor_runtime), "seconds\n") | |
#print("FILE storing Runtime:", FILE_minor_runtime, "seconds\n") | |
print("The SNPs with Minor Allele Frequency >", MAF_threshold ,"have been saved in the file: maf_screening_snps.txt ") | |
except: | |
print("""\nFile not found or not in the required format. | |
Please make sure: | |
-You have spelled the filename correctly. | |
-The file is in the directory you run the programme. | |
-The file is in the requested format. | |
*NOTE* | |
For help and file format info type: | |
python3 pactool.py -h | |
""") | |
else: | |
no_maf_file = "Minor Allele Frequency test not selected" | |
print(no_maf_file) | |
########################################################################################################### | |
# END OF MINOR ALLELE FREQUENCY TEST BLOCK: | |
########################################################################################################### | |
########################################################################################################### | |
# START OF HWE TEST BLOCK: | |
########################################################################################################### | |
hwe_start = timeit.default_timer() | |
HWE_threshold = 0.001 | |
input_hwe ="""\nHWE Test: | |
This test will keep only the SNPs with Hardy Weinberg expectation p-value > 0.001. | |
Proceed [y/n]:\n""" | |
hwe_test = input(input_hwe) | |
if hwe_test == "y": | |
hwe_file = input("\nEnter output.hwe filename (e.g analysis.hwe) from PACtool analysis.\n") | |
try: | |
with open(hwe_file) as f: | |
hwe_LIST = [x.split() for x in f] # hwe_LIST: A LIST, which each row of the file as sublist | |
# [i] print(len(hwe_LIST), hwe_LIST[0:2]) | |
# [o] 195146 [['snp_code', 'hwe_statistic', 'p-value'], ['snp_0', '0.420491403517', '0.810385108461']] | |
hwe_cols = [x for x in zip(*hwe_LIST)] # hwe_cols: A list, with 3 tuples; could use list(x) for x if LIST of sublists needed | |
# 1st tuple = 1st column, snp_code | |
# len(hwe_cols[0]) = 195146 | |
# hwe_cols[0][0:3]: ('snp_code', 'snp_0', 'snp_1') | |
# 2nd tuple = 2nd column, hwe_statistic | |
# len(hwe_cols[1]) = 195146 | |
# hwe_cols[1][0:3]: ('hwe_statistic', '0.420491403517', '0.320919559327') | |
# 3rd tuple = 3rd column, p_value | |
# len(hwe_cols[2]) = 195146 | |
# hwe_cols[2][0:3]: ('p-value', '0.810385108461', '0.851752080638') | |
#NOTE: | |
#The output.hwe has a header; hence the [1:] below | |
hwe_pvalues = list(hwe_cols[2][1:]) # [i] print(len(pvalues_keep)) | |
# [o] 195145 | |
hwe_SNPs = list(hwe_cols[0][1:]) # [i] print(len(snp_keep)) | |
# [o] 195145 | |
hwe_pvals_kept = [] # hwe_pvals_kept: a list that will store all HWE pvalues > 0.001 | |
hwe_snps_kept = [] # hwe_snps_kept: a list that will store all snp_codes, from sample with HWE pvalues > 0.001 | |
hwe_count_out = 0 # hwe_count_out: a counter that will count how many samples were discarted in the HWE filtering | |
for x,y in zip(hwe_pvalues,hwe_SNPs): | |
if float(x) > HWE_threshold: #aka: if pvalue > 0.001 | |
hwe_pvals_kept.append(float(x)) | |
hwe_snps_kept.append(y) #redundant list, yeap | |
hwe_intersection_list.append(y) | |
else: | |
hwe_count_out = hwe_count_out + 1 | |
# AT THIS POINT WE HAVE A LIST WITH SNP_CODES OF SNPs THAT PASSED THE HWE pvalue TEST | |
# THIS LIST WILL BE COMPARED TO THE RESPECTIVE LISTS THAT WILL BE GENERATED IN THE NEXT TWO TESTS: | |
# (ii)SNPs with minor allele frequencies > 0.05 and | |
# (iii)statistically significant SNPs from the ASSOCIATION TEST | |
# FINALLY, THE INTERSECTION OF ALL THREE LISTS WILL BE THE SELECTED SNPs FOR FURTHER ANALYSIS | |
# HWE test SNPs THAT PASSED THE "expectation p-value < 0.001" TEST | |
# ARE STORED NOW IN THE LIST NAMED: hwe_snps_kept | |
# WILL BE ALSO STORED IN A FILE NAMED: hwe_screening_SNPs.txt | |
# FEEL FREE TO COMMENT OUT THE FOLLOWING BLOCK THAT STORES SNPs IN A FILE, | |
# IF YOU NEED TO SAVE TIME AND DO NOT NEED THE OUTPUT FILE. | |
# YOUR SNPs WILL BE ONLY STORED IN THE LIST: hwe_snps_kept | |
# *NOTE* | |
# PREFERABLY COMMENT OUT CODE WITH 2 HASHTAGS, TO DENOTE THAT IT IS NOT COMMENTS BUT "SILENCED" CODE. | |
############################################################################################### | |
FILE_start = timeit.default_timer() | |
hwe_kept_LIST = [list(x) for x in zip(hwe_snps_kept, hwe_pvals_kept)] | |
with open("hwe_screening_SNPs.txt", "w") as h: | |
writer1 = csv.DictWriter(h, fieldnames = ["snp_code", "hwe_pvalue"], delimiter = ' ') | |
writer1.writeheader() | |
writer2 = csv.writer(h, delimiter =' ') | |
writer2.writerows(hwe_kept_LIST) | |
#NOTE: hwe_screening_SNPs.txt file has a header, so it should be: | |
#[i]: wc -l hwe_screening_SNPs.txt | |
#[o]: len(hwe_snps_kept + 1 | here: 192207 + 1 = 192208 | |
FILE_end = timeit.default_timer() | |
FILE_runtime = FILE_end - FILE_start | |
################################################################################################# | |
hwe_end = timeit.default_timer() | |
hwe_runtime = hwe_end - hwe_start | |
print(">HWE SCREENING:") | |
print(hwe_count_out, "SNPs were discarded from total (", len(hwe_SNPs),")in the HWE screening.") | |
print(len(hwe_snps_kept),"SNPs with expectaton pvalue >", HWE_threshold ,"were kept in the HWE screening.") | |
print("HWE Screening Runtime:", math.ceil(hwe_runtime), "seconds\n") | |
#print("FILE storing Runtime:", math.ceil(FILE_runtime), "seconds\n") | |
#print("HWE Screening Runtime:", hwe_runtime, "seconds\n") | |
#print("FILE storing Runtime:", FILE_runtime, "seconds\n") | |
print("The SNPs with Hardy Weiberg expectaton pvalue >", HWE_threshold , " have been saved in the file: hwe_screening_SNPs.txt ") | |
except: | |
print("""\nFile not found or not in the required format. | |
Please make sure: | |
-You have spelled the filename correctly. | |
-The file is in the directory you run the programme. | |
-The file is in the requested format. | |
*NOTE* | |
For help and file format info type: | |
python3 pactool.py -h | |
""") | |
else: | |
no_hwe_file = "HWE test not selected" | |
print(no_hwe_file) | |
########################################################################################################### | |
# END OF HWE TEST BLOCK: | |
########################################################################################################### | |
########################################################################################################### | |
# START OF ASSOCIATION TEST BLOCK: | |
########################################################################################################### | |
assoc_start = timeit.default_timer() | |
input_assoc = """\nAssociation Test: | |
This test will keep only statistically significant SNPs. | |
Proceed [y/n]:\n""" | |
assoc_test = input(input_assoc) | |
if assoc_test == "y": | |
try: | |
assoc_file = input("\nEnter output.association filename (e.g chr20.association) from PACtool analysis:\n") | |
ASSOC_threshold = input("\nPlease select p-value threshold as a float number, e.g. 0.0001\nRecommended threshold 0.000001 or lower.\n") | |
with open('outputassoc_total.txt') as asc: | |
assoc_LIST = [x.split() for x in asc] # assoc_LIST: A LIST, which each row of the file as sublist | |
# len(assoc_LIST) = 144192 | |
# assoc_LIST[0]): ['snp_code', 'locus', 'ref', 'alt', 'p-value', 'OR_rr_ra', 'OR_rr_aa', 'OR_ra_aa'] | |
assoc_cols = [x for x in zip(*assoc_LIST)] # assoc_cols: A list, with 8 tuples; | |
# assoc_cols[0]___1. snp_code | |
# 2. locus | |
# 3. ref | |
# 4. alt | |
# assoc_cols[4]___5. p-value | |
# 6. OR_rr_ra | |
# 7. OR_rr_aa | |
# 8. OR_ra_aa | |
assoc_pvalues = list(assoc_cols[4][1:]) # [1:] ,because of the header, | |
# [i] print(len(assoc_pvalues), assoc_pvalues[0:2]) | |
# [o] 144191 ['0.591069437423', '0.467213155849'] | |
assoc_SNPs = list(assoc_cols[0][1:]) # Skip HEADER again! [1:] | |
# [i] print(len(assoc_SNPs), assoc_SNPs[0:4]) | |
# [o] 144191 ['snp_0', 'snp_2', 'snp_3', 'snp_4'] | |
assoc_pvals_kept = [] # assoc_pvals_kept: a list that'll store all assoc test samples w/ pvalues < 0.001 | |
assoc_snps_kept = [] # assoc_snps_kept: a list that'll store all snp_codes, from all assoc test samples w/ pvalues < 0.001 | |
assoc_count_out = 0 # assoc_count_out: a counter for how many samples were discarted in the assoc test pvalues < 0.001 filtering | |
for x,y in zip(assoc_pvalues,assoc_SNPs): | |
if float(x) < float(ASSOC_threshold): | |
assoc_pvals_kept.append(float(x)) | |
assoc_snps_kept.append(y) | |
assoc_intersection_list.append(y) #for the final comparison of SNPs left in all selected tests | |
else: | |
assoc_count_out = assoc_count_out + 1 | |
# AT THIS POINT WE HAVE A LIST WITH SNP_CODES OF SNPs THAT PASSED THE Association pvalue TEST | |
# THIS LIST WILL BE COMPARED TO THE RESPECTIVE LISTS THAT WILL BE GENERATED IN OTHER TWO TESTS: | |
# (i) HWE p-value > 0.001 | |
# (ii)SNPs with minor allele frequencies > 0.05 and | |
# FINALLY, THE INTERSECTION OF ALL THREE LISTS WILL BE THE SELECTED SNPs FOR FURTHER ANALYSIS | |
# SNPs THAT PASSED THE "Association Test p-value < 0.000001" TEST | |
# ARE STORED NOW IN THE LIST NAMED: assoc_snps_kept | |
# ARE STORED ALSO IN A FILE NAMED: assoc_screening_SNPs.txt | |
# FEEL FREE TO COMMENT OUT THE FOLLOWING BLOCK THAT STORES SNPs IN A FILE, | |
# IF YOU NEED TO SAVE TIME AND DO NOT NEED THE OUTPUT FILE. | |
# YOUR SNPs WILL BE ONLY STORED IN THE LIST: assoc_snps_kept | |
# *NOTE* | |
# PREFERABLY COMMENT OUT CODE WITH 2 HASHTAGS, TO DENOTE THAT IT IS NOT COMMENTS. | |
############################################################################################### | |
FILE_assoc_start = timeit.default_timer() | |
assoc_kept_LIST = [list(x) for x in zip(assoc_snps_kept, assoc_pvals_kept)] | |
with open("assoc_screening_SNPs.txt", "w") as a: | |
writer1 = csv.DictWriter(a, fieldnames = ["snp_code", "assoc_pvalue"], delimiter = ' ') | |
writer1.writeheader() | |
writer2 = csv.writer(a, delimiter =' ') | |
writer2.writerows(assoc_kept_LIST) | |
FILE_assoc_end = timeit.default_timer() | |
FILE_assoc_runtime = FILE_assoc_end - FILE_assoc_start | |
################################################################################################ | |
assoc_end = timeit.default_timer() | |
assoc_end = timeit.default_timer() | |
assoc_runtime = assoc_end - assoc_start | |
print(">ASSOCIATION TEST SCREENING:") | |
print(assoc_count_out, "SNPs were discarded from total (",len(assoc_SNPs) ,") of file outputassoc_total.txt, during the Association Test screening.") | |
print(len(assoc_snps_kept),"SNPs with Association Test p-value <", ASSOC_threshold ," were kept in the Association Test screening.") | |
print("Association Test Screening Runtime:", math.ceil(assoc_runtime), "seconds\n") | |
#print("FILE storing Runtime:", math.ceil(FILE_assoc_runtime), "seconds\n") | |
#print("Association Test Screening Runtime:", assoc_runtime, "seconds\n") | |
#print("FILE storing Runtime:", FILE_assoc_runtime, "seconds\n") | |
print("The SNPs with Association Test p-value <",ASSOC_threshold ," have been saved in the file: assoc_screening_SNPs.txt \n") | |
except: | |
print("""\nFile not found or not in the required format. | |
Please make sure: | |
-You have entered a valid float number for p-value. | |
-You have spelled the filename correctly. | |
-The file is in the directory you run the programme. | |
-The file is in the requested format. | |
*NOTE* | |
For help and file format info type: | |
python3 pactool.py -h | |
""") | |
else: | |
no_assoc_file = "Association Test not selected." | |
print(no_assoc_file) | |
########################################################################################################### | |
# END OF ASSOCIATION TEST BLOCK: | |
########################################################################################################### | |
# INTERSECTION MADmESS: | |
# All possible test combinations the user might have chosen. | |
# -Find the intersections of the seleted SNPs for each combo | |
# -Keeping the snp_code in 1 column files: | |
# 1: maf + hwe + assoc | |
if maf_test == "y" and hwe_test == "y" and assoc_test == "y": | |
maf_hwe_assoc = (set(maf_intersection_list+assoc_intersection_list).intersection(hwe_intersection_list)) | |
print('\n'.join(maf_hwe_assoc), file=open("maf_hwe_assoc_snps.txt", "w") ) | |
print("After MAF, HWE and association test", len(maf_hwe_assoc), " SNPs were left") | |
print("A list with the selected SNPs has been saved in the file: maf_hwe_assoc_snps.txt") | |
else: | |
pass | |
# 2: maf + hwe | |
if maf_test == "y" and hwe_test == "y" and assoc_test != "y": | |
maf_hwe = (set(maf_intersection_list).intersection(hwe_intersection_list)) | |
print('\n'.join(maf_hwe), file=open("maf_hwe_snps.txt", "w") ) | |
print("After MAF and HWE test", len(maf_hwe), " SNPs were left") | |
print("A list with the selected SNPs has been saved in the file: maf_hwe_snps.txt") | |
else: | |
pass | |
# 3: maf + assoc | |
if maf_test == "y" and hwe_test != "y" and assoc_test == "y": | |
maf_assoc = (set(maf_intersection_list).intersection(assoc_intersection_list)) | |
print('\n'.join(maf_assoc), file=open("maf_assoc_snps.txt", "w") ) | |
print("After MAF and association test", len(maf_assoc), " SNPs were left") | |
print("A list with the selected SNPs has been saved in the file: maf_assoc_snps.txt") | |
else: | |
pass | |
# 4: hwe + assoc | |
if maf_test != "y" and hwe_test == "y" and assoc_test == "y": | |
hwe_assoc = (set(assoc_intersection_list).intersection(hwe_intersection_list)) | |
print('\n'.join(hwe_assoc), file=open("hwe_assoc_snps.txt", "w") ) | |
print("After HWE and association test", len(hwe_assoc), " SNPs were left") | |
print("A list with the selected SNPs has been saved in the file: hwe_assoc_snps.txt") | |
else: | |
pass | |
# 5: hwe | |
if maf_test != "y" and hwe_test == "y" and assoc_test != "y": | |
print('\n'.join(hwe_intersection_list), file=open("hwe_snps.txt", "w") ) | |
print(len(hwe_intersection_list), "passed the HWE threshold, p >", HWE_threshold, ".") | |
print("A list with the selected SNPs has been saved in the file: hwe_snps.txt") | |
else: | |
pass | |
# 6: maf | |
if maf_test == "y" and hwe_test != "y" and assoc_test != "y": | |
print('\n'.join(maf_intersection_list), file=open("maf_snps.txt", "w") ) | |
print(len(maf_intersection_list), "passed the Minor Allele Frequency Threshold >" , MAF_threshold, ".") | |
print("A list with the selected SNPs has been saved in the file: maf_snps.txt") | |
else: | |
pass | |
# 7: assoc | |
if maf_test != "y" and hwe_test != "y" and assoc_test == "y": | |
print('\n'.join(assoc_intersection_list), file=open("assoc_snps.txt", "w") ) | |
print(len(assoc_intersection_list),"passed the Association test threshold", ASSOC_threshold, ".") | |
print("A list with the selected SNPs has been saved in the file: assoc_snps.txt") | |
else: | |
pass | |
#print(len(maf_intersection_list)) | |
#print(len(hwe_intersection_list)) | |
#print(len(assoc_intersection_list)) | |
##################################################################################################################### | |
# | |
# T H E E N D | |
# | |
##################################################################################################################### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment