Created
March 30, 2015 13:19
-
-
Save anonymous/d3d3bab45d7ae741e37d to your computer and use it in GitHub Desktop.
Analyze your Health app data using R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Setting up all required elements | |
setwd('~/Documents/Health') | |
options(stringsAsFactors=FALSE) | |
Sys.setlocale(category="LC_ALL", locale = "en_US.UTF-8") | |
library(XML) | |
library(ggplot2) | |
##################### | |
# PREPROCESSING # | |
##################### | |
# It's possible to meet whitespaced tag attributes, that ruins xml-parser. | |
# So first let's just read xml file as text. | |
xmlText <- paste(readLines('export.xml'), '\n', collapse='') | |
# But the true way is to make script, looking for spaced tag attributes | |
# Here is the regexpr magic: | |
reg.tags <- gregexpr('<[^>]+>', xmlText)[[1]] | |
tags.to.replace <- vector() | |
for(i in 1:length(reg.tags)){ | |
tag <- substr(xmlText, reg.tags[i], reg.tags[i]+attributes(reg.tags)$match.length[i]-1) | |
if(grepl('"', tag)){ | |
tag <- substr(tag, gregexpr(' ', tag)[[1]][1]+1, nchar(tag)-2) | |
for(attrib in strsplit(tag, '" ')[[1]]){ | |
attrib <- substr(attrib, 1, gregexpr('=', attrib)[[1]][1]-1) | |
if(grepl(' ', attrib)){ | |
tags.to.replace <- append(tags.to.replace, attrib) | |
} | |
} | |
} | |
} | |
tags.to.replace <- unique(tags.to.replace) | |
for(tag in tags.to.replace){ | |
xmlText <- gsub(tag, gsub(' ', '.', tag), xmlText) | |
} | |
# Now we are ready to parse xml | |
data <- xmlTreeParse(xmlText, asText=TRUE) | |
# Now let's find all possible record types, | |
# and produce appropriate data.frames for every type of record | |
data <- xmlToList(data$doc$children[[1]]) | |
data <- data[names(data) == 'Record'] | |
dfs <- list() | |
for(line in data){ | |
if(line['type'] %in% names(dfs)){ | |
dfs[[line['type']]] <- rbind(dfs[[line['type']]], line) | |
}else{ | |
dfs[[line['type']]] = rbind(data.frame(), as.list(line)) | |
} | |
} | |
# Next step: prettifying data.frames | |
for(df.name in names(dfs)){ | |
# Removing column with data.frame name | |
if('type' %in% names(dfs[[df.name]])){ | |
dfs[[df.name]] <- dfs[[df.name]][,-which(names(dfs[[df.name]]) == 'type')] | |
} | |
# Converting dates to POSIXlt format | |
for(var in c('creationDate', 'startDate', 'endDate')){ | |
if(var %in% names(dfs[[df.name]])){ | |
dfs[[df.name]][,var] <- as.POSIXct(strptime(dfs[[df.name]][,var], format='%Y%m%d%H%M%S%z')) | |
} | |
} | |
# Converting numeric values to numbers | |
for(var in c('value', 'min', 'max', 'average')){ | |
if(var %in% names(dfs[[df.name]])){ | |
dfs[[df.name]][,var] <- as.numeric(dfs[[df.name]][,var]) | |
} | |
} | |
if('recordCount' %in% names(dfs[[df.name]])){ | |
dfs[[df.name]]$recordCount <- as.integer(dfs[[df.name]]$recordCount) | |
} | |
# Converting sources and units to factors | |
for(var in c('source', 'unit')){ | |
if(var %in% names(dfs[[df.name]])){ | |
dfs[[df.name]][,var] <- factor(dfs[[df.name]][,var]) | |
} | |
} | |
} | |
##################### | |
# ANALYSIS # | |
##################### | |
# Let's print names and sizes for available datasets | |
stat <- as.data.frame(do.call('rbind', lapply(names(dfs), function(x) list(parameter = substr(x, 25, nchar(x)), records = nrow(dfs[[x]]))))) | |
print(stat) | |
# Plot Heart Rate dynamics | |
df <- dfs$HKQuantityTypeIdentifierHeartRate | |
# Convert units | |
df$min <- as.integer(df$min*60) | |
df$max <- as.integer(df$max*60) | |
df$average <- as.integer(df$average*60) | |
df$unit <- 'count/min' | |
# Remove outliers | |
df <- df[abs(df$average - mean(df$average)) < 3*sd(df$average), ] | |
p <- ggplot(df, aes(x=startDate, y=average)) | |
p <- p + geom_point(aes(color=source)) | |
p <- p + geom_smooth(method='loess') | |
p <- p + scale_color_brewer(palette='Dark2') | |
p <- p + labs(title='Heart Rate Dynamics', x='Date', y='Beats per min') | |
p <- p + theme_bw() | |
p |
The "regexpr magic" starting on line 20 does not work for me. My XML file from Apple Health is about 30mb, and this regexpr magic is running in R for over an hour now. Sometimes I get this error: internal error: Huge input lookup.
How can I fix this?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great! Thank You!