Skip to content

Instantly share code, notes, and snippets.

@cgpu
Last active October 15, 2018 13:10
Show Gist options
  • Save cgpu/7ef78ea5b49be7d6bc09af3258cf9707 to your computer and use it in GitHub Desktop.
Save cgpu/7ef78ea5b49be7d6bc09af3258cf9707 to your computer and use it in GitHub Desktop.
from JADBio_parser_args import JADBioparser
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: cristina
Script for parsing all JAD output .txt files of an input_directory
to collect ref and equivalent signatures based on user defined threshold for ROC AUC.
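User defined arguments:
`input_dir`: directory that is searched recursively for JAD output .txt files
`output_dir`: directory where one .csv file per retained contrast is written
`AUC_filtering`: AUC filtering flag; use 1 to filter by average AUC, 2 to filter by minimum AUC (lower limit of the AUC interval)
`AUC_threshold`: filtering threshold; a contrast is kept only if the chosen AUC is strictly greater than this value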
"""
# TASK: iterate over all files in a given directory
# @stack: https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory
def _parse_auc_row(auc_row):
    """Return ``(auc, lower_limit, upper_limit)`` parsed from the AUC metric line.

    The numbers are located through anchor characters: the value sits between
    ``"e: "`` and ``"("``, the lower limit between ``"I: "`` and the first
    ``","``, and the upper limit between ``", "`` and ``")"``.
    NOTE(review): this presumably matches a line such as
    ``Area Under the ROC Curve: 0.95 (CI: 0.90, 0.99)`` -- confirm against a
    real JAD report.

    Every value is rounded to 3 decimals.

    Raises:
        ValueError: if an anchor is missing or a number cannot be parsed.
    """
    label_at = auc_row.find('e:')
    open_at = auc_row.find('(')
    ci_at = auc_row.find('I:')
    comma_at = auc_row.find(',')
    close_at = auc_row.find(')')
    if min(label_at, open_at, ci_at, comma_at, close_at) < 0:
        raise ValueError(
            "Unexpected layout of the AUC metric line: {!r}".format(auc_row)
        )
    # Offsets skip the anchor itself plus the single blank that follows it.
    auc = float(auc_row[label_at + 3:open_at])
    lower_limit = float(auc_row[ci_at + 3:comma_at])
    upper_limit = float(auc_row[comma_at + 2:close_at])
    return round(auc, 3), round(lower_limit, 3), round(upper_limit, 3)


def _parse_jad_report(report_lines):
    """Split the lines of one JAD report into its signature and AUC parts.

    Args:
        report_lines: non-empty lines of the report, each still carrying its
            trailing newline.

    Returns:
        A tuple ``(ref_signature, equivalent_rows, auc_triple)`` where
        ``ref_signature`` is the list of features on the line after the
        ``Reference Signature`` marker, ``equivalent_rows`` holds one list of
        features per line found between the ``Equivalent Signatures`` and
        ``Metric results`` markers, and ``auc_triple`` is the result of
        ``_parse_auc_row`` for the line right after ``Metric results``.

    Raises:
        ValueError: if one of the three section markers is missing or the
            AUC line cannot be parsed.
    """
    # The markers are matched verbatim, including their trailing blank.
    ref_at = report_lines.index("Reference Signature \n")
    equiv_at = report_lines.index('Equivalent Signatures \n')
    metrics_at = report_lines.index('Metric results\n')

    ref_signature = report_lines[ref_at + 1].split()
    equivalent_rows = [row.split() for row in report_lines[equiv_at + 1:metrics_at]]
    auc_triple = _parse_auc_row(report_lines[metrics_at + 1])
    return ref_signature, equivalent_rows, auc_triple


def JADbio_parser(input_dir, output_dir, AUC_filtering, AUC_threshold):
    """Convert every JAD report below ``input_dir`` into a signature table.

    Each ``*.txt`` file found recursively under ``input_dir`` is treated as
    one contrast. For every file the reference signature, the equivalents of
    each reference feature and the AUC values are collected into a table with
    the columns ``flag``, ``ref_feat_1`` .. ``ref_feat_N``, ``avgAUC``,
    ``minAUC`` and ``maxAUC``. The first row holds the reference features
    (flag ``ref``); the following rows hold their equivalents (flag
    ``equiv``), padded with the string ``"NA"`` where a feature has fewer
    equivalents than the others.

    The table is written to ``output_dir`` as ``<contrast>.csv`` (same
    relative path as the report, sub-directories are created on demand) only
    when the selected AUC is strictly greater than ``AUC_threshold``.

    Args:
        input_dir: directory searched recursively for ``*.txt`` reports.
        output_dir: directory receiving the ``.csv`` files.
        AUC_filtering: ``1`` to filter on the average AUC, ``2`` to filter on
            the lower limit of the AUC interval.
        AUC_threshold: value the selected AUC has to exceed.

    Raises:
        ValueError: if ``AUC_filtering`` is neither 1 nor 2, or if a report
            does not have the expected layout.
    """
    from pathlib import Path

    from pandas import DataFrame

    # Reject a bad flag up front instead of failing with an unbound local
    # after the first report has already been parsed.
    if AUC_filtering not in (1, 2):
        raise ValueError(
            "AUC_filtering must be 1 (average AUC) or 2 (minimum AUC), got "
            + repr(AUC_filtering)
        )

    print("_____________S T A R T _____________")

    input_root = Path(input_dir)
    output_root = Path(output_dir)
    # Sorted so that the processing order does not depend on the file system.
    report_paths = sorted(input_root.glob('**/*.txt'))
    total_reports = len(report_paths)

    for file_number, report_path in enumerate(report_paths, start=1):
        print("\n\n>looping over files in the directory..\n")
        print("\n")

        # Contrast name = path relative to the input directory without the
        # '.txt' suffix. Working on path objects keeps this correct whether
        # or not the directory arguments carry a trailing separator.
        contrast = report_path.relative_to(input_root).as_posix()[:-4]

        print("Reading file with contrast: ", report_path.as_posix(), "\n")
        print("File " + str(file_number) + " of " + str(total_reports))
        print("\n")

        # Blank lines are ignored and the first remaining line (the header)
        # is left out.
        with open(str(report_path)) as report:
            report_lines = [line for line in report if line != "\n"][1:]

        ref_signature, equivalent_rows, auc_triple = _parse_jad_report(report_lines)
        average_auc, lower_auc, upper_auc = auc_triple

        if not ref_signature:
            raise ValueError(
                "No reference signature found in " + report_path.as_posix()
            )
        if len(equivalent_rows) < len(ref_signature):
            raise ValueError(
                "Fewer equivalent-signature lines than reference features in "
                + report_path.as_posix()
            )

        print("Info from file '" + contrast + "' retrieved.")
        print("Looping over reference signature features ..")
        print("\n")

        equivalents_per_ref = []
        for position, ref_feature in enumerate(ref_signature):
            print("reference feature " + str(position + 1) + " of " + str(len(ref_signature)))
            # The report lists the reference feature among its own
            # equivalents; drop exactly that name. An equality test is
            # required here: a substring test would also discard e.g. "A1"
            # when the reference feature is "A10".
            equivalents_per_ref.append(
                [feature for feature in equivalent_rows[position] if feature != ref_feature]
            )

        # One row for the reference features plus as many rows as the
        # longest list of equivalents.
        row_count = 1 + max(len(equivalents) for equivalents in equivalents_per_ref)

        column_order = ["flag"]
        columns = {"flag": ["ref"] + ["equiv"] * (row_count - 1)}
        for position, ref_feature in enumerate(ref_signature):
            column_name = "ref_feat_" + str(position + 1)
            cells = [ref_feature] + equivalents_per_ref[position]
            cells += ["NA"] * (row_count - len(cells))
            column_order.append(column_name)
            columns[column_name] = cells
        for column_name, auc_value in (("avgAUC", average_auc),
                                       ("minAUC", lower_auc),
                                       ("maxAUC", upper_auc)):
            column_order.append(column_name)
            columns[column_name] = [auc_value] * row_count

        df_ref_equiv_assigned = DataFrame(
            columns, columns=column_order, index=range(1, row_count + 1)
        )

        filtering_threshold = average_auc if AUC_filtering == 1 else lower_auc

        if filtering_threshold > AUC_threshold:
            destination = output_root / (contrast + ".csv")
            destination.parent.mkdir(parents=True, exist_ok=True)
            df_ref_equiv_assigned.to_csv(
                str(destination),
                sep=',',
                encoding='utf-8',
                header=True,
                index=False,
            )
            print(df_ref_equiv_assigned)
        else:
            print("Contras " + contrast + " didn't make the AUC threshold")
            print("too low AUC at " + str(filtering_threshold))
            print("pass and continue with remaining contrasts")

    print("\n\nFinished!\n\nYour output files are now stored in the folder:\n" + str(output_dir))
    print("_____________ T h e E N D _____________")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment