Skip to content

Instantly share code, notes, and snippets.

Created March 30, 2015 13:19
Show Gist options
  • Save anonymous/d3d3bab45d7ae741e37d to your computer and use it in GitHub Desktop.
Save anonymous/d3d3bab45d7ae741e37d to your computer and use it in GitHub Desktop.
Analyze your Health app data using R
# -*- coding: utf-8 -*-
# Setting up all required elements
setwd('~/Documents/Health')
options(stringsAsFactors=FALSE)
Sys.setlocale(category="LC_ALL", locale = "en_US.UTF-8")
library(XML)
library(ggplot2)
#####################
# PREPROCESSING #
#####################
# It's possible to meet whitespaced tag attributes, that ruins xml-parser.
# So first let's just read xml file as text.
xmlText <- paste(readLines('export.xml'), '\n', collapse='')
# But the true way is to make script, looking for spaced tag attributes
# Here is the regexpr magic:
reg.tags <- gregexpr('<[^>]+>', xmlText)[[1]]
tags.to.replace <- vector()
for(i in 1:length(reg.tags)){
tag <- substr(xmlText, reg.tags[i], reg.tags[i]+attributes(reg.tags)$match.length[i]-1)
if(grepl('"', tag)){
tag <- substr(tag, gregexpr(' ', tag)[[1]][1]+1, nchar(tag)-2)
for(attrib in strsplit(tag, '" ')[[1]]){
attrib <- substr(attrib, 1, gregexpr('=', attrib)[[1]][1]-1)
if(grepl(' ', attrib)){
tags.to.replace <- append(tags.to.replace, attrib)
}
}
}
}
tags.to.replace <- unique(tags.to.replace)
for(tag in tags.to.replace){
xmlText <- gsub(tag, gsub(' ', '.', tag), xmlText)
}
# Now we are ready to parse xml
data <- xmlTreeParse(xmlText, asText=TRUE)
# Now let's find all possible record types,
# and produce appropriate data.frames for every type of record
data <- xmlToList(data$doc$children[[1]])
data <- data[names(data) == 'Record']
dfs <- list()
for(line in data){
if(line['type'] %in% names(dfs)){
dfs[[line['type']]] <- rbind(dfs[[line['type']]], line)
}else{
dfs[[line['type']]] = rbind(data.frame(), as.list(line))
}
}
# Next step: prettifying data.frames
for(df.name in names(dfs)){
# Removing column with data.frame name
if('type' %in% names(dfs[[df.name]])){
dfs[[df.name]] <- dfs[[df.name]][,-which(names(dfs[[df.name]]) == 'type')]
}
# Converting dates to POSIXlt format
for(var in c('creationDate', 'startDate', 'endDate')){
if(var %in% names(dfs[[df.name]])){
dfs[[df.name]][,var] <- as.POSIXct(strptime(dfs[[df.name]][,var], format='%Y%m%d%H%M%S%z'))
}
}
# Converting numeric values to numbers
for(var in c('value', 'min', 'max', 'average')){
if(var %in% names(dfs[[df.name]])){
dfs[[df.name]][,var] <- as.numeric(dfs[[df.name]][,var])
}
}
if('recordCount' %in% names(dfs[[df.name]])){
dfs[[df.name]]$recordCount <- as.integer(dfs[[df.name]]$recordCount)
}
# Converting sources and units to factors
for(var in c('source', 'unit')){
if(var %in% names(dfs[[df.name]])){
dfs[[df.name]][,var] <- factor(dfs[[df.name]][,var])
}
}
}
#####################
# ANALYSIS #
#####################
# Let's print names and sizes for available datasets
stat <- as.data.frame(do.call('rbind', lapply(names(dfs), function(x) list(parameter = substr(x, 25, nchar(x)), records = nrow(dfs[[x]])))))
print(stat)
# Plot Heart Rate dynamics
df <- dfs$HKQuantityTypeIdentifierHeartRate
# Convert units
df$min <- as.integer(df$min*60)
df$max <- as.integer(df$max*60)
df$average <- as.integer(df$average*60)
df$unit <- 'count/min'
# Remove outliers
df <- df[abs(df$average - mean(df$average)) < 3*sd(df$average), ]
p <- ggplot(df, aes(x=startDate, y=average))
p <- p + geom_point(aes(color=source))
p <- p + geom_smooth(method='loess')
p <- p + scale_color_brewer(palette='Dark2')
p <- p + labs(title='Heart Rate Dynamics', x='Date', y='Beats per min')
p <- p + theme_bw()
p
@dkuemper
Copy link

Great! Thank You!

@maxwxyz
Copy link

maxwxyz commented Apr 21, 2016

The "regexpr magic" starting on line 20 does not work for me. My XML file from Apple Health is about 30mb, and this regexpr magic is running in R for over an hour now. Sometimes I get this error: internal error: Huge input lookup.
How can I fix this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment