Last active
March 15, 2019 10:11
-
-
Save walterst/ca4a41d32cceba809c77b55fc2c068cc to your computer and use it in GitHub Desktop.
Custom script used to parse tab-delimited iPod data, match up dates from tab-delimited QIIME mapping data, and write averages of data from multiple days on and prior to QIIME metadata sample dates as metadata columns. This script uses a QIIME 1.9X environment for the parse_mapping_file function.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import division | |
# USAGE: python parse_ipod_to_metadata.py mapping_file days_to_consider ipod_tab_delim_file raw_output_file qiime_compatible_output_file | |
# where days_to_consider counts the same-day as one of the days, and comma-separated columns need to be
# an exact match to the field label in the ipod data file, e.g. Gastrointestinal_issues | |
# All dates must be in the format of DD/MM/YY in the ipod source tab delimited data. | |
from sys import argv | |
from operator import itemgetter | |
from datetime import datetime, date | |
from numpy import array, mean | |
from qiime.parse import parse_mapping_file | |
# For now, a subset of the data, until all data can be quantified for parsing.
# NOTE: the order of this list must stay aligned with target_fields_lookup below.
target_fields = (
    # wake-up time (kept as the raw value)
    ["What time did you wake up today? (please use military time)"]
    # meal portions (translated via portions_numeric)
    + ["How much of your breakfast did you eat?",
       "How much of your lunch did you eat?",
       "How much of it did you eat?"]
    # beverages consumed (presence/absence)
    + ["Bottled water",
       "Base purified water",
       "Tap water",
       "Soda",
       "Sports energy drink (e.g. Gatorade)",
       "Coffee or tea",
       "Milk",
       "Fruit juice",
       "Yogurt",
       "Beer, wine, or spirits"]
    # overall wellness (translated via portions_numeric)
    + ["How are you feeling today?"]
    # symptoms (presence/absence)
    + ["Fever",
       "Gastrointestinal issues",
       "Headache",
       "Tiredness",
       "Runny or blocked nose",
       "Rash",
       "Muscle strain",
       "Cramp"]
    # quantified daily activity (translated via portions_numeric)
    + ["How much exercise did you do?",
       "How many times did you urinate today?",
       "How many times did you have a bowel movement today?",
       "Please describe the consistency of your stool."]
    # bed time (kept as the raw value)
    + ["What time did you get into bed before going to sleep today? (please use military time)"]
)
# Translate categorical survey answers to numeric strings.  One shared table
# serves several questions because their answer sets do not collide.
portions_numeric = {}
# all empty fields become NA
portions_numeric[""] = "NA"
# what fraction was eaten at breakfast, lunch, or dinner
portions_numeric.update({"All of it": "4", "3/4 of it": "3",
                         "1/2 of it": "2", "1/4 of it": "1"})
# reported wellness, from "How are you feeling today?"
portions_numeric.update({"Good": "3", "Ill": "2", "Very Ill": "1"})
# exercise duration
portions_numeric.update({"Less than 30 mins": "1", "30 mins to 1 hour": "2",
                         "1 to 2 hours": "3", "2+ hours": "4"})
# urination frequency
portions_numeric.update({"1-2": "1.5", "3-5": "4", "6-9": "7.5", "9+": "9"})
# bowel movement frequency: "0" through "5" map to themselves, "6+" caps at "6"
portions_numeric.update(dict((str(i), str(i)) for i in range(6)))
portions_numeric["6+"] = "6"
# stool consistency
portions_numeric.update({"Hard and formed (like a cigar)": "1",
                         "Soft and formed (like peanut butter)": "2",
                         "Loose and unformed (like a thick milkshake)": "3",
                         "Liquid (like water)": "4"})
# Single-item yes/no categories: the category text present in the field -> "1",
# an empty field -> "0".
_yes_no_fields = [
    "Bottled water", "Base purified water", "Tap water", "Soda",
    "Sports energy drink (e.g. Gatorade)", "Coffee or tea", "Milk",
    "Fruit juice", "Yogurt", "Beer, wine, or spirits",
    "Fever", "Gastrointestinal issues", "Headache", "Tiredness",
    "Runny or blocked nose", "Rash", "Muscle strain", "Cramp",
]
presence_absence = dict((field, "1") for field in _yes_no_fields)
presence_absence[""] = "0"
# Parallel to target_fields: the translation table to apply to each column.
# None means the raw (stripped) value is kept as-is.
target_fields_lookup = (
    [None]                     # wake-up time: raw military time
    + [portions_numeric] * 3   # breakfast / lunch / dinner portions
    + [presence_absence] * 10  # beverages
    + [portions_numeric]       # "How are you feeling today?"
    + [presence_absence] * 8   # symptoms
    + [portions_numeric] * 4   # exercise, urination, bowel movements, stool
    + [None]                   # bed time: raw military time
)
# --- Command-line arguments and output files (see USAGE comment above) --------
mapping_f = argv[1]
# parse_mapping_file returns (data rows, headers, comment lines); comments unused.
mapping_data, mapping_headers, _ = parse_mapping_file(open(mapping_f, 'U'))
# Window size in days; the sample day itself counts as one of the days.
days_considered = int(argv[2])
ipod_f = open(argv[3], "U")
raw_data_outf = open(argv[4], "w")
qiime_data_outf = open(argv[5], "w")
# QIIME mapping files begin with a "#"-commented header line; the rest of the
# header is written when qiime_data is flushed at the end of the script.
qiime_data_outf.write("#")
# Column indices of each target field within the iPod file's header row.
target_field_ixs = []
ipod_data = {}
""" Attempting this approach to organizing:
Data will be loaded from ipod touch form as a dictionary with tuples as keys of:
(4 digit ID code, ordinal date int value):[list of strip()ed data from tsv ipod specified headers]
Will have to query the dictionary for the presence of each id,date from the mapping file,
have KeyError exceptions to indicate missing data.
"""
# 2 samples from Turkey in ipod survey, also extra PHR sample, ignoring these for now
ignore_ids = ["2002","2004","2005","PHR"]
counter = 0  # NOTE(review): never incremented or read anywhere below
# --- Load the iPod survey export into ipod_data -------------------------------
# ipod_data maps (participant id, ordinal date) -> list of translated values,
# one per entry in target_fields.
for line in ipod_f:
    # Skip blank/whitespace-only lines.  The original `len(curr_line) == 0`
    # guard could never fire (str.split always returns at least one element),
    # so a trailing blank line would have raised an IndexError below.
    if not line.strip():
        continue
    curr_line = line.replace("\n","").split("\t")
    if line.startswith("#"):
        # Header row: locate the participant-id and date columns plus every
        # target field by exact header-name match (ValueError if any is absent).
        user_ix = curr_line.index("User")
        date_ipod_ix = curr_line.index("CorrectedDateDDMMYY")
        for curr_field in target_fields:
            target_field_ixs.append(curr_line.index(curr_field))
        continue
    curr_id = curr_line[user_ix].strip()
    # If date is NA, or if in IDs to skip listed above, skip appending data
    if curr_line[date_ipod_ix].strip() == "NA" or curr_id in ignore_ids:
        continue
    # Dates in the iPod export are DD/MM/YY; store the ordinal int so later
    # day-window arithmetic is simple subtraction.
    curr_date = datetime.strptime(curr_line[date_ipod_ix].strip(), '%d/%m/%y').date().toordinal()
    curr_added_data = []
    # Pair each column index directly with its translation table.  The original
    # `target_fields_lookup[target_field_ixs.index(n)]` was O(n) per field and
    # would pick the wrong table if two target fields ever shared a column index.
    for col_ix, curr_lookup in zip(target_field_ixs, target_fields_lookup):
        raw_val = curr_line[col_ix].strip()
        curr_val = curr_lookup[raw_val] if curr_lookup else raw_val
        # Raw (untranslated) empty fields also become NA.
        if len(curr_val) == 0:
            curr_val = "NA"
        curr_added_data.append(curr_val)
    ipod_data[(curr_id, curr_date)] = curr_added_data
# Column positions of the sample date and numeric participant ID in the mapping file.
date_mapping_ix = mapping_headers.index("SampleDate")
numeric_id_ix = mapping_headers.index("Numeric_ID")
# Dicts keyed by (participant id, ordinal date) tuples; rows with NA in either
# field are skipped entirely.
id_dates_in_mapping = {}
# Same keys, kept separately for the hours-slept calculation (needs an extra day).
id_dates_in_mapping_sleep = {}
# Keep each full mapping row so it can be written back out with the iPod data.
metadata_line_in_mapping = {}
for row in mapping_data:
    if row[date_mapping_ix] == "NA" or row[numeric_id_ix] == "NA":
        continue
    # Mapping dates are also DD/MM/YY; convert to an ordinal day number.
    row_ordinal = datetime.strptime(row[date_mapping_ix].strip(), '%d/%m/%y').date().toordinal()
    row_key = (row[numeric_id_ix].strip(), row_ordinal)
    id_dates_in_mapping[row_key] = []
    id_dates_in_mapping_sleep[row_key] = []
    metadata_line_in_mapping[row_key] = row
# For every (id, date) key, list the (id, date) tuples to query from the iPod
# data: the sample day itself plus the (days_considered - 1) days before it.
for curr_key in id_dates_in_mapping:
    key_id, key_day = curr_key
    id_dates_in_mapping[curr_key] = [(key_id, key_day - back)
                                     for back in range(days_considered)]
# Same windows for the sleep bookkeeping, with one extra (earlier) day so the
# previous night's bed time is available for the hours-slept calculation.
for curr_key in id_dates_in_mapping_sleep:
    key_id, key_day = curr_key
    id_dates_in_mapping_sleep[curr_key] = [(key_id, key_day - back)
                                           for back in range(days_considered + 1)]
# Insert the new metadata headers into the mapping headers, just before the
# last column (Description), which QIIME expects to remain final.
target_fields.append("Time_Slept")
# Copy so the parsed header list itself is not mutated; the original bound
# corrected_headers directly to mapping_headers (an alias).
corrected_headers = mapping_headers[:]
for curr_header in target_fields:
    corrected_headers.insert(-1, curr_header)
# Accumulates the QIIME-compatible output rows, header first.  (The former
# raw_data list was never read anywhere and has been removed.)
qiime_data = [corrected_headers]
# --- Hours-slept calculation --------------------------------------------------
# For each (id, date) sleep window, pair each day's wake-up time with the
# previous day's bed time and compute hours slept.  Results are keyed by
# (id, ordinal date) in ipod_sleeping_hours; "NA" marks missing/unusable data.
awake_ix = 0    # wake-up time is the first target field in each ipod_data row
asleep_ix = -1  # bed time is the last target field in each ipod_data row
ipod_sleeping_hours = {}

def _military_time_to_hours_mins(time_str):
    # Slice hours/minutes by digit count, replacing the logic the original
    # duplicated for the awake and asleep values:
    # "2130" -> (21, 30); "730" -> (7, 30); "45" or "5" -> (0, minutes).
    if len(time_str) == 4:
        return int(time_str[0:2]), int(time_str[2:])
    if len(time_str) == 3:
        return int(time_str[0:1]), int(time_str[1:])
    return 0, int(time_str)

for curr_key in id_dates_in_mapping_sleep:
    # The per-key date list runs newest day first.  Each day is compared with
    # the next entry (the previous calendar day); when both records exist the
    # two times become full datetime objects and are subtracted.  A bed time
    # between midnight and 11:30 is treated as the early morning of the
    # following day, so the sleep date is incremented by one in that case.
    curr_window = id_dates_in_mapping_sleep[curr_key]
    for day_ix in range(len(curr_window) - 1):
        day_key = curr_window[day_ix]
        prior_day_key = curr_window[day_ix + 1]
        # Either day's record will often be missing entirely; the two original
        # try/except blocks assigned the same "NA" on the same key, so they
        # are merged here.
        try:
            curr_awake = ipod_data[day_key][awake_ix]
            curr_asleep = ipod_data[prior_day_key][asleep_ix]
        except KeyError:
            ipod_sleeping_hours[day_key] = "NA"
            continue
        # If either field has NA value, skip; should not be many of these.
        if curr_awake == "NA" or curr_asleep == "NA":
            ipod_sleeping_hours[day_key] = "NA"
            continue
        # Bed times from 0000 through 1130 count as after midnight.
        if 0 <= int(curr_asleep) <= 1130:
            inc_date = 1
        else:
            inc_date = 0
        # Recover calendar dates from the ordinals, correcting the sleep date
        # if the participant went to bed in the AM.
        date_awake = date.fromordinal(day_key[1])
        date_asleep = date.fromordinal(prior_day_key[1] + inc_date)
        awake_hours, awake_mins = _military_time_to_hours_mins(curr_awake)
        asleep_hours, asleep_mins = _military_time_to_hours_mins(curr_asleep)
        converted_awake = datetime(date_awake.year, date_awake.month,
                                   date_awake.day, awake_hours, awake_mins)
        converted_asleep = datetime(date_asleep.year, date_asleep.month,
                                    date_asleep.day, asleep_hours, asleep_mins)
        time_diff = converted_awake - converted_asleep
        # NOTE(review): timedelta.seconds ignores the .days component, so a
        # negative or >24h interval silently wraps -- behavior preserved as-is.
        time_slept_hours = time_diff.seconds / 3600
        # Stored here and merged into ipod_data after this loop, since the
        # ipod rows are still being read from during this pass.
        ipod_sleeping_hours[day_key] = "%2.2f" % time_slept_hours
# (Removed: count_missing_firstix / count_missing_secondix counters and a
# commented-out fallback block -- none were used anywhere in the script.)
# Append the hours-slept value as one extra field on each iPod record that has
# actual survey data; sleep keys with no matching survey row are left alone.
# (Every field was given "NA" above when missing, so this fills in the subset
# matching real ipod_data rows, as needed for the later averaging pass.)
for sleep_key, slept_val in ipod_sleeping_hours.items():
    if sleep_key in ipod_data:
        ipod_data[sleep_key].append(slept_val)
# Now to query ipod data for each target id:date combo, average each field over
# the window, and write both the raw log and the QIIME-compatible output rows.
for curr_key in id_dates_in_mapping:
    target_vals = []    # full ipod_data rows found within the date window
    queried_dates = []  # readable dates that actually had data (for the log)
    average_vals = []   # per-field averages (strings) added to the mapping row
    fill_empty_fields = True # If no data found in date range, use this to fill empty data in final mapping
    for curr_id_date in id_dates_in_mapping[curr_key]:
        # Will often be empty, so have to do try/except commands
        try:
            target_vals.append(ipod_data[curr_id_date])
        except KeyError:
            continue
        queried_dates.append("%s" % date.fromordinal(curr_id_date[1]))
        fill_empty_fields = False
    # Transpose data, needed to step through values, ignore NA or empty fields.
    # NOTE(review): this relies on Python 2's map() returning a list -- under
    # Python 3 the iterator would be exhausted before the raw-log write below.
    transposed_vals = map(list, zip(*target_vals))
    for n in transposed_vals:
        curr_vals = []
        for x in n:
            if x == "NA" or x == '':
                continue
            # NOTE(review): raw (untranslated) fields such as the military
            # wake/bed times are averaged as plain floats here -- confirm that
            # averaging e.g. "2330" and "0030" as 2330.0/30.0 is intended.
            curr_vals.append(float(x))
        # If empty, put "NA" in the field, else put average of values
        if len(curr_vals) == 0:
            average_vals.append("NA")
        else:
            average_vals.append("%4.2f" % (mean(array(curr_vals))))
    if fill_empty_fields:
        # No iPod data anywhere in the window: one NA per added column
        # (target_fields already includes the appended Time_Slept header here).
        average_vals = ["NA"] * len(target_fields)
    # Write out raw data to log file
    raw_data_outf.write("****************************\n")
    raw_data_outf.write("4 digit ID and date for current date: %s,%s\n" % (curr_key[0], date.fromordinal(curr_key[1])))
    raw_data_outf.write("Dates from IPOD data queried: %s \n" % queried_dates)
    raw_data_outf.write("Headers queried: %s \n" % ",".join(target_fields))
    raw_data_outf.write("Raw values for each category: %s \n" % transposed_vals)
    raw_data_outf.write("Averaged values for each category: %s \n" % average_vals)
    raw_data_outf.write("Mapping metadata line associated with the above values: %s \n" % "\t".join(metadata_line_in_mapping[curr_key]))
    # Add data to metadata lines, write out to qiime-formatted file.
    # (The inserts mutate the stored mapping row in place, before the final
    # Description column, matching the corrected_headers layout.)
    curr_metadata_line = metadata_line_in_mapping[curr_key]
    for curr_average in average_vals:
        curr_metadata_line.insert(-1, curr_average)
    qiime_data.append(curr_metadata_line)
# Emit the QIIME-compatible mapping file; the leading "#" for the header line
# was already written when the file was opened above.
for line in qiime_data:
    qiime_data_outf.write("\t".join(line))
    qiime_data_outf.write('\n')
"""
list.insert(location or -1, value) for inserting data before the end.
If no data are available, put NA in the field.
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment