Skip to content

Instantly share code, notes, and snippets.

@cgpu
Created September 4, 2018 05:48
Show Gist options
  • Save cgpu/70acbc09e1d9fe64cc1bdccdd0b72b39 to your computer and use it in GitHub Desktop.
Save cgpu/70acbc09e1d9fe64cc1bdccdd0b72b39 to your computer and use it in GitHub Desktop.
Analysis pipeline for filtering of SNPs. Not DRY or fast. For archiving purposes and future revisit.
#!/usr/bin/env python3
# TO RUN JUST TYPE:
# python3 snp_selection.py
# INPUT FILES AND OPTIONS WILL BE REQUESTED AS USER INPUT
# THE FOLLOWING PROGRAMME IS PART OF the PACtool package (https://pypi.python.org/pypi/PACtool)
# OUTPUT FILES FROM PACtool GWAS ANALYSIS CAN BE ANALYZED
# FOR THE SELECTION OF STATISTICALLY SIGNIFICANT SNPs.
# OPTIONS TO PERFORM EITHER COMBINATION OF THE FOLLOWING TESTS ARE AVAILABLE:
# Minor Allele Frequency Screening; only keep SNPs with MAF > 0.05
# HWE; only keep samples with Hardy Weiberg Expectation p-value > 0.001
# Association Test; option to select p-value threshold (p < 0.000001 recommended )
# INPUT FILES SHOULD BE THE PACtool OUTPUT FILES
# WITH THE FOLLOWING FILE EXTENSION SUFFIXES:
#
# .frequency
# .hwe
# .association
#
# FOR MORE DETAILS TYPE:
# python pactool.py -h
import csv # To store output of screening in a file
import timeit # For checking runtime in secs
import math # Just to round up runtime float, with math.ceil(runtime)
freqs_start = timeit.default_timer()
#Naming the lists that we will use in the end to take the SNPs that satisfy all selected tests' criteria:
#Leaving them empty allows for flexibility if a test is not selected.
maf_intersection_list = []
hwe_intersection_list = []
assoc_intersection_list = []
###########################################################################################################
# START OF MINOR ALLELE FREQUENCY TEST BLOCK:
###########################################################################################################
MAF_threshold = 0.05
print("Please select tests for your SNP screening:")
input_maf = """\nMAF Test:
This test will keep only the SNPs with Minor Allele Frequency (MAF) > 0.05.
Proceed [y/n]:\n"""
maf_test = input(input_maf)
if maf_test == "y":
maf_file = input("\nEnter output.frequency filename (e.g chr20.frequency) from PACtool analysis:\n\n")
try:
with open(maf_file) as f:
freqs_LIST = [x.split() for x in f]
# freqs_LIST: A LIST, which each row of the file as sublist
# [i] print(len(freqs_LIST), freqs_LIST[0:2])
# [o] 195146
# [
# ['snp_code', 'ref_freq_control', 'alt_freq_control', 'ref_freq_cases', 'alt_freq_cases', 'ref_freq_total', 'alt_freq_total'],
# ['snp_0', '0.802', '0.198', '0.785', '0.215', '0.7935', '0.2065']
# ]
freqs_cols = [x for x in zip(*freqs_LIST)]
# freqs_cols: A list, with 7 tuples;
# 0.snp_code ______freqs_cols[0] ('snp_code', 'snp_0', 'snp_1')
# 1.ref_freq_control
# 2.alt_freq_control
# 3.ref_freq_cases
# 4.alt_freq_cases
# 5.ref_freq_total ______freqs_cols[5]
# 6.alt_freq_total ______freqs_cols[6]
#BEFORE KEEPING SNPs WITH MAF > 0.05
#WE HAVE TO FIND WHICH ONE IS THE MAF.
#CREATING THE LISTS and COUNTERS NEEDED
#TO KEEP MINOR ALLELE FREQUENCY OF EACH SNPs
#Read below, the if loop for more details:
snp_code_freqs = list(freqs_cols[0][1:]) #freqs_cols[0][0] is the header, so we start from [0][1]
ref_freqs = list(freqs_cols[5][1:]) #refers to total observed frequency in the population for the REF allele
alt_freqs = list(freqs_cols[6][1:]) #refers to total observed frequency in the population for the ALT allele
minor_snps_kept = [] #the very same list as this one: snp_code_freqs = list(freqs_cols[0][1:])
minor_freqs_kept = [] #a list that will hold the value of the minor allele frequency for each snp_code
minor_freqs_from = [] #a list, with len= 195145, containing these strings: "ref", "alt" | order stays true to the above lists
#thus we could know which is the minor allele for each snp
ref_minor_snps = [] #a list that stores the snp_codes of snps with the REF as the minor allele
ref_minor_counter = 0 #a counter that counts how many snps have the ref as minor allele
#len(ref_minor_snps) == ref_minor_counter == minor_freqs_from.count("ref")| check if True
alt_minor_snps = [] #a list that stores the snp_codes of snps with the ALT as the minor allele
alt_minor_counter = 0 #a counter that counts how many snps have the ALT as minor allele
#len(alt_minor_snps) == alt_minor_counter == minor_freqs_from.count("alt")| check if True
# FINDING THE MINOR ALLELE FOR EACH SNP:
# x ____ snp_code_freqs
# q ____ ref_freqs
# z ____ alt_freqs
# The loop below compares the values of ref_freq_total and alt_freq_total of a snp,
# the value that is smaller of the two, is the minor allele frequency
for x,q,z in zip(snp_code_freqs, ref_freqs, alt_freqs ): #order is super important here - be meticulous
if float(q) < float(z): #IF aka REF is the minor allele:
#THEN:
minor_freqs_kept.append(float(q)) # add that value to the list with the minor allele freqs
minor_snps_kept.append(x) # add the respective snp_code for that value in the list that holds the snp_codes | yes, redundant list; either redundant or ugly
ref_minor_snps.append(x) # add the respective snp_code for that value also in ref_minor_snps | only if minor is ref here
minor_freqs_from.append("ref") # add a "ref", in the list minor_freqs_from, for the respective snp
ref_minor_counter = ref_minor_counter + 1 # add 1 to the counter so that we know how many SNPs have the REF as minor allele
else: #ELSE: aka if float(y) > float(z), aka ALT is the minor allele:
minor_freqs_kept.append(float(z)) #THEN:
minor_snps_kept.append(x) # add that value to the list with the minor allele freqs
alt_minor_snps.append(x) # add the respective snp_code for that value in the list that holds the snp_codes
minor_freqs_from.append("alt") # add an "alt", in the list minor_freqs_from, for the respective snp
alt_minor_counter = alt_minor_counter + 1 # add 1 to the counter so that we know how many SNPs have the ALT as minor allele
#STORING snp_code, minor_freq, ref/alt for each SNP
#as a sublist in a LIST, that holds as many sublists
#as the SNPs; 195145 that is.
#So, len(kept_LIST)==195145 (check if True)
#kept_LIST[i][0] is snp_code
#kept_LIST[i][1] is the value of minor allele freq for snp_i
#kept_LIST[i][2] is either "ref" or "alt"; denotes which one is the minor for snp_i
##############################################################################################################################
#COMMENTED OUT BLOCK THAT CREATES A FILE WITH ALL SNPs, EVEN < 0.05 MINOR ALLELE FREQ
##minor_LIST = [list(x) for x in zip(freqs_snps_kept, minor_freqs_kept, minor_freqs_from)]
## with open("minor_freq_SNPs.txt", "w") as q: #just changing the usual f to q, for avoiding conflict of variable claiming
## writer1 = csv.DictWriter(q, fieldnames = ["snp_code", "minor_freq", "minor_allele"], delimiter = ' ')
## writer1.writeheader()
## writer2 = csv.writer(q, delimiter =' ')
## writer2.writerows(minor_LIST)
###############################################################################################################################
# CREATING THE LISTS and COUNTERS NEEDED
# TO ONLY KEEP SNPs with ALLELE FREQUENCY > 0.05
# Read comments above the following if-Loop for more details:
final_minor_freqs = []
final_minor_snps = []
final_minor_allele = []
final_minor_counter = 0
final_minor_out = 0
# DISCARDING SNPs with minor allele frequency < 0.05:
# w_________freqs_snps_kept
# e_________minor_freqs_kept
# r_________minor_freqs_from
for w,e,r in zip(minor_snps_kept, minor_freqs_kept, minor_freqs_from):
if float(e) > 0.05: #(ONLY) IF the minor allele frequency > 0.05 (hint:KEEP SNP)
#THEN:
final_minor_freqs.append(float(e)) #add that value to the list with the final SNPs' minor allele freqs
final_minor_snps.append(w) #add the respective snp_code for that value in the list that holds the snp_codes
maf_intersection_list.append(w) #add the respective snp_code in the list for the intersection btween tests
final_minor_allele.append(r) #add the "ref" or "alt", to denote the minor allele
final_minor_counter = final_minor_counter + 1 #add 1 to the counter so that we know how many SNPs have minor all freq > 0.05
else:
final_minor_out = final_minor_out + 1 #add 1 to the counter so that we know how many SNPs have minor all freq < 0.0
#final_minor_counter+final_minor_out == 195145 | check if True
#########################################################################################################################################
# FEEL FREE TO COMMENT OUT THE FOLLOWING BLOCK THAT STORES SNPs IN A FILE.
# IF YOU NEED TO SAVE TIME AND DO NOT NEED THE OUTPUT FILE,
# YOUR SNPs WILL BE ONLY STORED IN THE LIST: final_minor_snps
# *NOTE*
# PREFERABLY COMMENT OUT CODE WITH 2 HASHTAGS, TO DENOTE THAT IT IS NOT COMMENTS BUT "SILENCED" CODE
##########################################################################################################################################
FILE_minor_start = timeit.default_timer()
final_minor_LIST = [list(x) for x in zip(final_minor_snps, final_minor_freqs, final_minor_allele)]
with open("maf_screening_snps.txt", "w") as e:
writer1 = csv.DictWriter(e, fieldnames = ["snp_code", "minor_freq", "minor_allele"], delimiter = ' ')
writer1.writeheader()
writer2 = csv.writer(e, delimiter =' ')
writer2.writerows(final_minor_LIST)
FILE_minor_end = timeit.default_timer()
FILE_minor_runtime = FILE_minor_end - FILE_minor_start
##########################################################################################################################################
freqs_end = timeit.default_timer()
freqs_runtime = freqs_end - freqs_start
print(">MINOR ALLELE FREQUENCY SCREENING:")
print(final_minor_out, "SNPs were discarded from total (", len(snp_code_freqs) ,") in the Minor Allele Frequency screening.")
print(len(final_minor_snps),"SNPs with Minor Allele Frequency >", MAF_threshold ,"were kept in the Minor Allele screening.")
print("Minor Allele Frequency Screening Runtime:", math.ceil(freqs_runtime), "seconds\n")
#print("FILE storing Runtime:", math.ceil(FILE_minor_runtime), "seconds\n")
#print("FILE storing Runtime:", FILE_minor_runtime, "seconds\n")
print("The SNPs with Minor Allele Frequency >", MAF_threshold ,"have been saved in the file: maf_screening_snps.txt ")
except:
print("""\nFile not found or not in the required format.
Please make sure:
-You have spelled the filename correctly.
-The file is in the directory you run the programme.
-The file is in the requested format.
*NOTE*
For help and file format info type:
python3 pactool.py -h
""")
else:
no_maf_file = "Minor Allele Frequency test not selected"
print(no_maf_file)
###########################################################################################################
# END OF MINOR ALLELE FREQUENCY TEST BLOCK:
###########################################################################################################
###########################################################################################################
# START OF HWE TEST BLOCK:
###########################################################################################################
hwe_start = timeit.default_timer()
HWE_threshold = 0.001
input_hwe ="""\nHWE Test:
This test will keep only the SNPs with Hardy Weinberg expectation p-value > 0.001.
Proceed [y/n]:\n"""
hwe_test = input(input_hwe)
if hwe_test == "y":
hwe_file = input("\nEnter output.hwe filename (e.g analysis.hwe) from PACtool analysis.\n")
try:
with open(hwe_file) as f:
hwe_LIST = [x.split() for x in f] # hwe_LIST: A LIST, which each row of the file as sublist
# [i] print(len(hwe_LIST), hwe_LIST[0:2])
# [o] 195146 [['snp_code', 'hwe_statistic', 'p-value'], ['snp_0', '0.420491403517', '0.810385108461']]
hwe_cols = [x for x in zip(*hwe_LIST)] # hwe_cols: A list, with 3 tuples; could use list(x) for x if LIST of sublists needed
# 1st tuple = 1st column, snp_code
# len(hwe_cols[0]) = 195146
# hwe_cols[0][0:3]: ('snp_code', 'snp_0', 'snp_1')
# 2nd tuple = 2nd column, hwe_statistic
# len(hwe_cols[1]) = 195146
# hwe_cols[1][0:3]: ('hwe_statistic', '0.420491403517', '0.320919559327')
# 3rd tuple = 3rd column, p_value
# len(hwe_cols[2]) = 195146
# hwe_cols[2][0:3]: ('p-value', '0.810385108461', '0.851752080638')
#NOTE:
#The output.hwe has a header; hence the [1:] below
hwe_pvalues = list(hwe_cols[2][1:]) # [i] print(len(pvalues_keep))
# [o] 195145
hwe_SNPs = list(hwe_cols[0][1:]) # [i] print(len(snp_keep))
# [o] 195145
hwe_pvals_kept = [] # hwe_pvals_kept: a list that will store all HWE pvalues > 0.001
hwe_snps_kept = [] # hwe_snps_kept: a list that will store all snp_codes, from sample with HWE pvalues > 0.001
hwe_count_out = 0 # hwe_count_out: a counter that will count how many samples were discarted in the HWE filtering
for x,y in zip(hwe_pvalues,hwe_SNPs):
if float(x) > HWE_threshold: #aka: if pvalue > 0.001
hwe_pvals_kept.append(float(x))
hwe_snps_kept.append(y) #redundant list, yeap
hwe_intersection_list.append(y)
else:
hwe_count_out = hwe_count_out + 1
# AT THIS POINT WE HAVE A LIST WITH SNP_CODES OF SNPs THAT PASSED THE HWE pvalue TEST
# THIS LIST WILL BE COMPARED TO THE RESPECTIVE LISTS THAT WILL BE GENERATED IN THE NEXT TWO TESTS:
# (ii)SNPs with minor allele frequencies > 0.05 and
# (iii)statistically significant SNPs from the ASSOCIATION TEST
# FINALLY, THE INTERSECTION OF ALL THREE LISTS WILL BE THE SELECTED SNPs FOR FURTHER ANALYSIS
# HWE test SNPs THAT PASSED THE "expectation p-value < 0.001" TEST
# ARE STORED NOW IN THE LIST NAMED: hwe_snps_kept
# WILL BE ALSO STORED IN A FILE NAMED: hwe_screening_SNPs.txt
# FEEL FREE TO COMMENT OUT THE FOLLOWING BLOCK THAT STORES SNPs IN A FILE,
# IF YOU NEED TO SAVE TIME AND DO NOT NEED THE OUTPUT FILE.
# YOUR SNPs WILL BE ONLY STORED IN THE LIST: hwe_snps_kept
# *NOTE*
# PREFERABLY COMMENT OUT CODE WITH 2 HASHTAGS, TO DENOTE THAT IT IS NOT COMMENTS BUT "SILENCED" CODE.
###############################################################################################
FILE_start = timeit.default_timer()
hwe_kept_LIST = [list(x) for x in zip(hwe_snps_kept, hwe_pvals_kept)]
with open("hwe_screening_SNPs.txt", "w") as h:
writer1 = csv.DictWriter(h, fieldnames = ["snp_code", "hwe_pvalue"], delimiter = ' ')
writer1.writeheader()
writer2 = csv.writer(h, delimiter =' ')
writer2.writerows(hwe_kept_LIST)
#NOTE: hwe_screening_SNPs.txt file has a header, so it should be:
#[i]: wc -l hwe_screening_SNPs.txt
#[o]: len(hwe_snps_kept + 1 | here: 192207 + 1 = 192208
FILE_end = timeit.default_timer()
FILE_runtime = FILE_end - FILE_start
#################################################################################################
hwe_end = timeit.default_timer()
hwe_runtime = hwe_end - hwe_start
print(">HWE SCREENING:")
print(hwe_count_out, "SNPs were discarded from total (", len(hwe_SNPs),")in the HWE screening.")
print(len(hwe_snps_kept),"SNPs with expectaton pvalue >", HWE_threshold ,"were kept in the HWE screening.")
print("HWE Screening Runtime:", math.ceil(hwe_runtime), "seconds\n")
#print("FILE storing Runtime:", math.ceil(FILE_runtime), "seconds\n")
#print("HWE Screening Runtime:", hwe_runtime, "seconds\n")
#print("FILE storing Runtime:", FILE_runtime, "seconds\n")
print("The SNPs with Hardy Weiberg expectaton pvalue >", HWE_threshold , " have been saved in the file: hwe_screening_SNPs.txt ")
except:
print("""\nFile not found or not in the required format.
Please make sure:
-You have spelled the filename correctly.
-The file is in the directory you run the programme.
-The file is in the requested format.
*NOTE*
For help and file format info type:
python3 pactool.py -h
""")
else:
no_hwe_file = "HWE test not selected"
print(no_hwe_file)
###########################################################################################################
# END OF HWE TEST BLOCK:
###########################################################################################################
###########################################################################################################
# START OF ASSOCIATION TEST BLOCK:
###########################################################################################################
assoc_start = timeit.default_timer()
input_assoc = """\nAssociation Test:
This test will keep only statistically significant SNPs.
Proceed [y/n]:\n"""
assoc_test = input(input_assoc)
if assoc_test == "y":
try:
assoc_file = input("\nEnter output.association filename (e.g chr20.association) from PACtool analysis:\n")
ASSOC_threshold = input("\nPlease select p-value threshold as a float number, e.g. 0.0001\nRecommended threshold 0.000001 or lower.\n")
with open('outputassoc_total.txt') as asc:
assoc_LIST = [x.split() for x in asc] # assoc_LIST: A LIST, which each row of the file as sublist
# len(assoc_LIST) = 144192
# assoc_LIST[0]): ['snp_code', 'locus', 'ref', 'alt', 'p-value', 'OR_rr_ra', 'OR_rr_aa', 'OR_ra_aa']
assoc_cols = [x for x in zip(*assoc_LIST)] # assoc_cols: A list, with 8 tuples;
# assoc_cols[0]___1. snp_code
# 2. locus
# 3. ref
# 4. alt
# assoc_cols[4]___5. p-value
# 6. OR_rr_ra
# 7. OR_rr_aa
# 8. OR_ra_aa
assoc_pvalues = list(assoc_cols[4][1:]) # [1:] ,because of the header,
# [i] print(len(assoc_pvalues), assoc_pvalues[0:2])
# [o] 144191 ['0.591069437423', '0.467213155849']
assoc_SNPs = list(assoc_cols[0][1:]) # Skip HEADER again! [1:]
# [i] print(len(assoc_SNPs), assoc_SNPs[0:4])
# [o] 144191 ['snp_0', 'snp_2', 'snp_3', 'snp_4']
assoc_pvals_kept = [] # assoc_pvals_kept: a list that'll store all assoc test samples w/ pvalues < 0.001
assoc_snps_kept = [] # assoc_snps_kept: a list that'll store all snp_codes, from all assoc test samples w/ pvalues < 0.001
assoc_count_out = 0 # assoc_count_out: a counter for how many samples were discarted in the assoc test pvalues < 0.001 filtering
for x,y in zip(assoc_pvalues,assoc_SNPs):
if float(x) < float(ASSOC_threshold):
assoc_pvals_kept.append(float(x))
assoc_snps_kept.append(y)
assoc_intersection_list.append(y) #for the final comparison of SNPs left in all selected tests
else:
assoc_count_out = assoc_count_out + 1
# AT THIS POINT WE HAVE A LIST WITH SNP_CODES OF SNPs THAT PASSED THE Association pvalue TEST
# THIS LIST WILL BE COMPARED TO THE RESPECTIVE LISTS THAT WILL BE GENERATED IN OTHER TWO TESTS:
# (i) HWE p-value > 0.001
# (ii)SNPs with minor allele frequencies > 0.05 and
# FINALLY, THE INTERSECTION OF ALL THREE LISTS WILL BE THE SELECTED SNPs FOR FURTHER ANALYSIS
# SNPs THAT PASSED THE "Association Test p-value < 0.000001" TEST
# ARE STORED NOW IN THE LIST NAMED: assoc_snps_kept
# ARE STORED ALSO IN A FILE NAMED: assoc_screening_SNPs.txt
# FEEL FREE TO COMMENT OUT THE FOLLOWING BLOCK THAT STORES SNPs IN A FILE,
# IF YOU NEED TO SAVE TIME AND DO NOT NEED THE OUTPUT FILE.
# YOUR SNPs WILL BE ONLY STORED IN THE LIST: assoc_snps_kept
# *NOTE*
# PREFERABLY COMMENT OUT CODE WITH 2 HASHTAGS, TO DENOTE THAT IT IS NOT COMMENTS.
###############################################################################################
FILE_assoc_start = timeit.default_timer()
assoc_kept_LIST = [list(x) for x in zip(assoc_snps_kept, assoc_pvals_kept)]
with open("assoc_screening_SNPs.txt", "w") as a:
writer1 = csv.DictWriter(a, fieldnames = ["snp_code", "assoc_pvalue"], delimiter = ' ')
writer1.writeheader()
writer2 = csv.writer(a, delimiter =' ')
writer2.writerows(assoc_kept_LIST)
FILE_assoc_end = timeit.default_timer()
FILE_assoc_runtime = FILE_assoc_end - FILE_assoc_start
################################################################################################
assoc_end = timeit.default_timer()
assoc_end = timeit.default_timer()
assoc_runtime = assoc_end - assoc_start
print(">ASSOCIATION TEST SCREENING:")
print(assoc_count_out, "SNPs were discarded from total (",len(assoc_SNPs) ,") of file outputassoc_total.txt, during the Association Test screening.")
print(len(assoc_snps_kept),"SNPs with Association Test p-value <", ASSOC_threshold ," were kept in the Association Test screening.")
print("Association Test Screening Runtime:", math.ceil(assoc_runtime), "seconds\n")
#print("FILE storing Runtime:", math.ceil(FILE_assoc_runtime), "seconds\n")
#print("Association Test Screening Runtime:", assoc_runtime, "seconds\n")
#print("FILE storing Runtime:", FILE_assoc_runtime, "seconds\n")
print("The SNPs with Association Test p-value <",ASSOC_threshold ," have been saved in the file: assoc_screening_SNPs.txt \n")
except:
print("""\nFile not found or not in the required format.
Please make sure:
-You have entered a valid float number for p-value.
-You have spelled the filename correctly.
-The file is in the directory you run the programme.
-The file is in the requested format.
*NOTE*
For help and file format info type:
python3 pactool.py -h
""")
else:
no_assoc_file = "Association Test not selected."
print(no_assoc_file)
###########################################################################################################
# END OF ASSOCIATION TEST BLOCK:
###########################################################################################################
# INTERSECTION MADmESS:
# All possible test combinations the user might have chosen.
# -Find the intersections of the seleted SNPs for each combo
# -Keeping the snp_code in 1 column files:
# 1: maf + hwe + assoc
if maf_test == "y" and hwe_test == "y" and assoc_test == "y":
maf_hwe_assoc = (set(maf_intersection_list+assoc_intersection_list).intersection(hwe_intersection_list))
print('\n'.join(maf_hwe_assoc), file=open("maf_hwe_assoc_snps.txt", "w") )
print("After MAF, HWE and association test", len(maf_hwe_assoc), " SNPs were left")
print("A list with the selected SNPs has been saved in the file: maf_hwe_assoc_snps.txt")
else:
pass
# 2: maf + hwe
if maf_test == "y" and hwe_test == "y" and assoc_test != "y":
maf_hwe = (set(maf_intersection_list).intersection(hwe_intersection_list))
print('\n'.join(maf_hwe), file=open("maf_hwe_snps.txt", "w") )
print("After MAF and HWE test", len(maf_hwe), " SNPs were left")
print("A list with the selected SNPs has been saved in the file: maf_hwe_snps.txt")
else:
pass
# 3: maf + assoc
if maf_test == "y" and hwe_test != "y" and assoc_test == "y":
maf_assoc = (set(maf_intersection_list).intersection(assoc_intersection_list))
print('\n'.join(maf_assoc), file=open("maf_assoc_snps.txt", "w") )
print("After MAF and association test", len(maf_assoc), " SNPs were left")
print("A list with the selected SNPs has been saved in the file: maf_assoc_snps.txt")
else:
pass
# 4: hwe + assoc
if maf_test != "y" and hwe_test == "y" and assoc_test == "y":
hwe_assoc = (set(assoc_intersection_list).intersection(hwe_intersection_list))
print('\n'.join(hwe_assoc), file=open("hwe_assoc_snps.txt", "w") )
print("After HWE and association test", len(hwe_assoc), " SNPs were left")
print("A list with the selected SNPs has been saved in the file: hwe_assoc_snps.txt")
else:
pass
# 5: hwe
if maf_test != "y" and hwe_test == "y" and assoc_test != "y":
print('\n'.join(hwe_intersection_list), file=open("hwe_snps.txt", "w") )
print(len(hwe_intersection_list), "passed the HWE threshold, p >", HWE_threshold, ".")
print("A list with the selected SNPs has been saved in the file: hwe_snps.txt")
else:
pass
# 6: maf
if maf_test == "y" and hwe_test != "y" and assoc_test != "y":
print('\n'.join(maf_intersection_list), file=open("maf_snps.txt", "w") )
print(len(maf_intersection_list), "passed the Minor Allele Frequency Threshold >" , MAF_threshold, ".")
print("A list with the selected SNPs has been saved in the file: maf_snps.txt")
else:
pass
# 7: assoc
if maf_test != "y" and hwe_test != "y" and assoc_test == "y":
print('\n'.join(assoc_intersection_list), file=open("assoc_snps.txt", "w") )
print(len(assoc_intersection_list),"passed the Association test threshold", ASSOC_threshold, ".")
print("A list with the selected SNPs has been saved in the file: assoc_snps.txt")
else:
pass
#print(len(maf_intersection_list))
#print(len(hwe_intersection_list))
#print(len(assoc_intersection_list))
#####################################################################################################################
#
# T H E E N D
#
#####################################################################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment