searchNdarLocal.py takes an input file (input.txt) with single keywords on each line, and searches a local database of NDAR behavioral data. Output includes: 1) outfile_vars.txt, with questions containing the search term(s) of interest on single lines, followed by an estimate of the min, max, and variable type; 2) outfile_data.txt, with subject IDs on single lines, each followed by that subject's tab-separated values for the matched questions (NaN where a subject is missing a metric).
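For illustration (hypothetical labels and values, inferred from what the script writes; exact numeric formatting may differ), a search for "gaze" might produce output along these lines:

outfile_vars.txt, one variable per line with a min/max estimate, or "text" when no numeric range was found:

    srs02_gaze01_gaze aversion	0.0	3.0
    adi_gazecmt_gaze comments	text

outfile_data.txt, tab-separated with one row per subject:

    subjectkey	srs02_gaze01_gaze aversion	adi_gazecmt_gaze comments
    NDAR_INVXX000	'2'	NaN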
#!/usr/bin/python
"""
searchNdar: reads in a set of strings from file, and searches behavioral metrics
to find questions / subscales that might be of interest

python searchNdarLocal.py --o outfile --i /home/vanessa/Documents/Work/NDAR/behavioral -w input.txt

--o is the outfile name, without extension
--i is the input directory, without a trailing slash
--w is the input text file, either in PWD or fullpath

Example input file:
gaze
eye
contact
"""
__author__ = "Vanessa Sochat ([email protected])"
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2013/11/01 $"
__license__ = "Python"
import os
import sys
import re
from os import listdir
from os.path import isfile, join

import pandas as pd
import numpy as nu

debug = False
# SUBJECT ----------------------------------------------------------------------------------
# A Subject object organizes all metric data by subject
class Subject:
    def __init__(self,N):
        self.subs = []        # a list of dictionaries, one per subject: subs[i] = {'metric':'value'}
        self.subind = dict()  # a dictionary to index the list above by subject ID
        self.N = N            # number of hits (columns from metrics)
        self.metrics = []     # list of column metric indices
        self.marker = 0       # holds index of last metric added
        self.submarker = 0    # holds index of last subject added
        self.data = []        # holds metric data
        self.suball = []      # a list of subjects specific to a metric
        self.vartype = []     # holds the (guessed) data type of a column
        self.labels = []      # all metric labels
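    # Illustrative sketch (not from the original, hypothetical IDs and labels):
    # after addSubjects and addMetric run, the structures might look like
    #   self.subs   = [ {'id': 'NDAR_INVXX000', 'srs02_gaze01_gaze aversion': "'2'"} ]
    #   self.subind = { 'NDAR_INVXX000': 0 }
    # so subind maps a subject ID to its row in subs, and each subs entry maps
    # a metric column label to that subject's raw value.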
    # addMetric: each of the N metric columns needs a consistent index
    def addMetric(self,metric):
        # First get all column names and their data
        colnames = metric.label
        colvals = metric.data
        # Now add each column name with metric name to our list
        for n in range(0,len(colnames)):
            self.metrics.append(self.marker)
            self.data.append(colvals[n])
            self.suball.append(metric.subid)   # save ALL subids here
            self.labels.append(colnames[n])
            self.marker = self.marker + 1
            # Save each subject's value to its dictionary; index 0 holds the
            # NDAR description row, so start at 1
            for s in range(1,len(metric.subid)):
                idx = self.subind[metric.subid[s]]   # find subject index
                self.subs[idx][colnames[n]] = colvals[n][s]
    # addSubjects: adds all subjects to the dictionary
    def addSubjects(self,hit):
        for s in range(1,len(hit.subid)):                # for each subject (skip the description row)
            if hit.subid[s] not in self.subind.keys():   # if they aren't added yet
                self.subs.append(dict())
                self.subs[self.submarker]['id'] = hit.subid[s]
                self.subind[hit.subid[s]] = self.submarker   # this is the index of the subject
                self.submarker = self.submarker + 1
    # When we get here, all subjects and metrics are added, and we have indices.
    def summaryStats(self,metrics):
        self.min = nu.zeros(shape=(len(self.metrics),1))
        self.max = nu.zeros(shape=(len(self.metrics),1))
        self.vartype = ['numerical']*len(self.metrics)
        print "Estimating summary statistics..."
        # Iterate through metrics, add subjects based on idx
        for m in self.metrics:
            # Grab data
            print self.labels[m]
            data = list(self.data[m])
            subs = list(self.suball[m])
            # The first entry of each column is the description row, so drop it
            data = data[1:len(data)]   # all values specific to a particular column metric
            subs = subs[1:len(subs)]   # all subjects that have the metric
            # Get an estimate of min/max for each column
            for s in range(0,len(subs)):
                if isinstance(data[s],basestring):   # skip nan, which is read as a float
                    # Only attempt a numeric estimate when the value has no letters or symbols
                    if not re.search("[A-Za-z]|[-]|[#]|[&]|[/]",data[s]):
                        val = data[s].strip('\'')
                        if self.min[m] > float(val): self.min[m] = float(val)
                        if self.max[m] < float(val): self.max[m] = float(val)
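    # Worked example (illustrative, not from the original): for a column whose
    # values read "'3'", "'7'", and nan, the nan is skipped (not a string), the
    # quoted numbers pass the letter/symbol filter, and min/max end up 0.0/7.0
    # (min never drops below its initial 0). A value like "often" matches the
    # [A-Za-z] filter, so such a column keeps min == max == 0 and is later
    # reported as "text".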
    # printSubjects: print variables and subject data to file
    def printSubjects(self,outfile):
        print "Printing variables to file..."
        outfcols = open(outfile + "_vars.txt",'w')   # single column of column names
        outf = open(outfile + "_data.txt",'w')       # the data
        # First print metric_column names
        outf.write("subjectkey\t")
        for ee in self.metrics:
            outf.write(self.labels[ee] + "\t")
        outf.write("\n")
        print "There are " + str(len(self.metrics)) + " data items total for " + str(len(self.subs)) + " subjects"
        # Print one line per variable: the min and max, or "text" if both are zero
        for ee in self.metrics:
            label = self.labels[ee]
            if self.min[ee] == 0 and self.max[ee] == 0:
                outfcols.writelines(label + "\t" + "text" + "\n")
            else:
                outfcols.writelines(label + "\t" + str(self.min[ee]) + "\t" + str(self.max[ee]) + "\n")
        # Iterate through subjects, and then metrics, to print rows
        for s in range(0,len(self.subs)):
            subdata = self.subs[s]
            name = subdata['id']
            print "Printing subject " + name + " " + str(s) + " of " + str(len(self.subs))
            outf.write(name + "\t")
            # Print subject data
            for ee in self.metrics:
                label = self.labels[ee]
                if label in subdata.keys():   # if the metric is a key in the subject dictionary, get data from there
                    outf.write(str(subdata[label]) + "\t")
                else:
                    outf.write('NaN' + "\t")
            outf.write('\n')
        outfcols.close()
        outf.close()
# METRIC ----------------------------------------------------------------------------------
# A Metric object holds results for a particular assessment
class Metric:
    def __init__(self,m,mfullfile,subid):
        self.name = m              # metric file name
        self.fullfile = mfullfile  # full path to the metric file
        self.subid = subid         # list of subject IDs
        self.data = []             # list of column data
        self.label = []            # list of full labels
    # addColumn: adds a column of data to the Metric, labeled name_column_description
    def addColumn(self,column,coldata,description):
        self.data.append(coldata)   # save the column data
        self.label.append(self.name + "_" + column + "_" + description)
# INPUT FUNCTIONS -----------------------------------------------------------------------
# readInput: read search words from file, one per line
def readInput(infile):
    words = []
    filey = open(infile,'r')
    for f in filey.readlines():
        words.append(f.strip('\n'))
    filey.close()
    return words
# createRegexp: create a regular expression that matches any of the words
def createRegexp(words):
    pattern_strings = []
    for w in words:
        pattern_strings.append(w)
    pattern_string = '|'.join(pattern_strings)
    pattern = re.compile(pattern_string)
    return pattern
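# Illustrative example (not part of the original): for the sample input file in
# the docstring, createRegexp(["gaze", "eye", "contact"]) compiles the pattern
# "gaze|eye|contact", so any of the three words matches as a substring. Note the
# words are joined without escaping, so regex metacharacters in an input word
# would be interpreted as regex syntax rather than literal text.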
# readMetrics: read metrics from the NDAR Data Dictionary and keep matching columns
def readMetrics(indir,pattern):
    # Get files in directory
    files = [ f for f in listdir(indir) if isfile(join(indir,f)) ]
    # This will be a list of metrics to return
    Metrics = []
    # Read in each file, search for pattern
    for f in files:
        fullfile = indir + '/' + f
        if debug: print "File is: " + fullfile
        # If we find a match anywhere in the file, parse the entire file
        if pattern.search(open(fullfile).read()):
            # Now read in csv, and save indexed columns with subject IDs as dictionary
            df = pd.read_csv(fullfile,delimiter='\t')
            subjectids = df.subjectkey   # subject IDs
            # First row is variable names, second is descriptions
            lines = open(fullfile).readlines()
            varnames = lines[0]
            descript = lines[1]
            colnums = []   # indices of columns to save
            count = 0
            if pattern.search(varnames.lower()) or pattern.search(descript.lower()):
                # Create a new metric object
                metric = Metric(f,fullfile,subjectids)
                # Find column indices with matches in the variable names...
                for v in (varnames.split('\t')):
                    if pattern.search(v.lower()):
                        colnums.append(count)
                    count = count + 1
                count = 0
                # ... and in the descriptions
                for d in (descript.split('\t')):
                    if pattern.search(d.lower()):
                        colnums.append(count)
                    count = count + 1
                # Only save unique values
                colnums = list(set(colnums))
                # Extract column names and descriptions
                varnames = varnames.split('\t')
                descript = descript.split('\t')
                varnames = [varnames[i] for i in colnums]
                descript = [descript[i] for i in colnums]
                # Add columns to metric object
                for i in range(0,len(varnames)):
                    colname = varnames[i].strip('"')
                    coldata = df[colname]   # the column data
                    d = descript[i]         # the full description
                    metric.addColumn(colname,coldata,d)   # column short name, full data, and description
                # Save metric to list
                if len(varnames) != 0: Metrics.append(metric)
    return Metrics
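# Sketch of the tab-delimited layout readMetrics assumes for each NDAR file
# (illustrative values, hypothetical column names):
#
#   subjectkey       eye_contact          gaze_aversion
#   The NDAR GUID    Rates eye contact    Rates gaze aversion
#   NDAR_INVXX000    '3'                  '1'
#   NDAR_INVXX001    '0'                  '2'
#
# Row 1 holds variable names, row 2 holds descriptions, and data rows follow;
# this is why the code above skips the first entry of each parsed column.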
# printResults: print results to file - hits is a list of Metric objects
def printResults(hits, outfile):
    # First get the number of hits per assessment - each hit gets its own column
    N = 0   # the number of hits, across all metrics
    for h in hits:
        N = N + len(h.data)
    print "Found " + str(N) + " column hits."
    S = Subject(N)   # a dict of subject indices, each w/ dict that indexes by metric/col name
    for h in hits:
        S.addSubjects(h)   # first add the subjects
        S.addMetric(h)     # now add the metric
    print "Number of metrics added is " + str(len(hits))
    # Calculate mins and maxes
    S.summaryStats(hits)
    # Now print to file
    S.printSubjects(outfile)
# MAIN ----------------------------------------------------------------------------------
def usage():
    print __doc__

def main(argv):
    # We need an input directory, an output file name, and an input file with words
    if len(argv) < 3: usage(); sys.exit()
    infile = None
    outfile = None
    wordfile = None
    # First cycle through the arguments to collect user variables
    for ar in range(0,len(argv)):
        if argv[ar] in ("-h", "--help"): usage(); sys.exit()
        elif argv[ar] in ("-i", "--i"): infile = argv[ar+1]
        elif argv[ar] in ("-o", "--o"): outfile = str(argv[ar+1])
        elif argv[ar] in ("-w", "--w"): wordfile = str(argv[ar+1])
    # The input directory and word file are required
    if infile is None or wordfile is None: usage(); sys.exit()
    # If the user has not specified an output file name, use the input file name
    if not outfile: outfile = os.path.basename(infile).split(".")[0]
    # Make sure variables are ok
    print "searchNdarLocal"
    print "-----------------------------------------------------------"
    print "Input file directory is: " + infile
    print "Word file is " + wordfile
    print "Output file name is " + outfile
    # Read words to search for in NDAR behavioral metrics
    print "Reading input files..."
    words = readInput(wordfile)
    # Create regular expression
    pattern = createRegexp(words)
    # Get all metrics from NDAR Data Dictionary
    metrics = readMetrics(infile,pattern)
    # Save and print metrics based on subject ID
    printResults(metrics,outfile)

if __name__ == "__main__":
    main(sys.argv[1:])