Created
June 8, 2016 00:05
-
-
Save cbare/498f537c6417e16939bbdf75a552d285 to your computer and use it in GitHub Desktop.
Assess the 4 selected features for the mPower dashboard, committing some fun R data geekery in the process
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Directionality of mPower features" | |
author: "J. Christopher Bare" | |
date: "June 6, 2016" | |
--- | |
```{r imports, echo=FALSE, message=FALSE, warning=FALSE} | |
library(synapseClient) | |
library(ggplot2) | |
library(scales) | |
synapseLogin() | |
``` | |
Define functions to read activity meta-data and feature table, perform age-matching, and make plots comparing case and control populations. | |
```{r define_functions, echo=FALSE} | |
read_activity_data <- function(activity_table_synid) { | |
q <- sprintf('select healthCode, recordId, medTimepoint, createdOn from %s', activity_table_synid) | |
results <- synTableQuery(q) | |
activity <- results@values | |
} | |
read_feature_table <- function(feature_table_synid) { | |
q <- sprintf('select * from %s', feature_table_synid) | |
results <- synTableQuery(q) | |
features <- results@values | |
} | |
merge_feature_data <- function(demo, activity, features) { | |
data <- merge(activity, features, by='recordId') | |
data <- merge(demo, data, by='healthCode') | |
} | |
get_feature_data <- function(demo, activity_table_synid, feature_table_synid) { | |
activity <- read_activity_data(activity_table_synid) | |
features <- read_feature_table(feature_table_synid) | |
data <- merge_feature_data(demo, activity, features) | |
} | |
plot_case_vs_control <- function(data, activity_name, feature_name, ...) { | |
ellipsis_args <- list(...) | |
p <- ggplot(data, aes_string(x="age", y=feature_name, | |
color="category", | |
shape="category")) + | |
geom_point(alpha=1/5, position=position_jitter(width=1)) + | |
geom_smooth(method="lm") + | |
geom_smooth(method="lm", color=alpha("black",0.3)) + | |
ggtitle(sprintf("%s vs age", feature_name)) | |
## Note that we have to carefully use coord_cartesian(ylim=...) here | |
## because setting ylim by itself causes ggplot to (helpfully??) | |
## leave data points to be left out of the regression. | |
if ('feature_range_limit' %in% names(ellipsis_args)) { | |
p <- p + coord_cartesian(ylim=ellipsis_args$feature_range_limit) | |
} | |
print(p) | |
## label the boxplots with their median value | |
medians <- data.frame(category=unique(data$category), | |
med=sapply(unique(data$category), function(cat) { | |
median(data[[feature_name]][data$category==cat]) | |
})) | |
p <- ggplot(data, aes_string(x="category", | |
y=feature_name, | |
fill="category")) + | |
geom_boxplot() + | |
theme(legend.position="none") + | |
ggtitle(sprintf("Case vs control %s", feature_name)) + | |
geom_text(data=medians, aes(x=category, y=med, label=med), size=3, vjust=-1.5) | |
if ('feature_range_limit' %in% names(ellipsis_args)) { | |
p <- p + coord_cartesian(ylim=ellipsis_args$feature_range_limit) | |
} | |
print(p) | |
p <- ggplot(data, aes_string(feature_name, | |
fill="category", | |
color="category")) + | |
geom_density(alpha = 0.1) + | |
ggtitle(sprintf("Case vs control %s", feature_name)) | |
if ('feature_range_limit' %in% names(ellipsis_args)) { | |
p <- p + coord_cartesian(xlim=ellipsis_args$feature_range_limit) | |
} | |
print(p) | |
} | |
resample_controls_matching_age <- function(data, width=5, min_age=0) { | |
cases <- data[ data$`professional-diagnosis` & data$age>=min_age, ] | |
controls <- data[ !data$`professional-diagnosis`, ] | |
## create index from age to controls of similar age | |
index <- lapply(min(cases$age):max(cases$age), function(age) { | |
if (age > 80) { | |
which(controls$age > 80) | |
} | |
else { | |
which(controls$age >= age-width & controls$age <= age+width) | |
} | |
}) | |
names(index) <- min(cases$age):max(cases$age) | |
resampled_control_indices <- sapply(sample(cases$age, nrow(cases), replace=TRUE), | |
function (age) { | |
sample(index[[as.character(age)]], 1) | |
}) | |
resampled_controls <- controls[resampled_control_indices,] | |
rbind(cases, resampled_controls) | |
} | |
extract_age_band <- function(data, sds=2) { | |
cases <- data[ data$`professional-diagnosis`, ] | |
controls <- data[ !data$`professional-diagnosis`, ] | |
lower_bound <- median(cases$age) - sds*sd(cases$age) | |
upper_bound <- median(cases$age) + sds*sd(cases$age) | |
cases_in_age_band <- cases[cases$age >= lower_bound & cases$age <= upper_bound, ] | |
controls_in_age_band <- controls[controls$age >= lower_bound & controls$age <= upper_bound, ] | |
rbind(cases_in_age_band, controls_in_age_band) | |
} | |
label_categories <- function(data) { | |
medTimepointStrings <- c( | |
'Immediately before Parkinson medication', | |
'Immediately before taking Parkinson medication', | |
'Just after Parkinson medication (at your best)', | |
'Just after taking Parkinson medication (at your best)') | |
pre <- medTimepointStrings[1:2] | |
post <- medTimepointStrings[3:4] | |
## label the three remaining categories | |
data$category <- ifelse(data$`professional-diagnosis`, 'case', 'control') | |
data$category[data$`professional-diagnosis` & data$medTimepoint %in% pre] <- 'case pre' | |
data$category[data$`professional-diagnosis` & data$medTimepoint %in% post] <- 'case post' | |
## remove activities of cases that are not at pre or post med time points | |
data <- data[data$category!='case',] | |
## convert category to an ordered factor | |
data$category <- factor(data$category, levels=c("case pre", "case post", "control"), ordered=TRUE) | |
if (length(unique(data$category)) > 3) stop("Unexpected categories: ", unique(data$category)) | |
return(data) | |
} | |
check_data <- function(data) { | |
message(sprintf("dim(data) = %d x %d", nrow(data), ncol(data))) | |
for (colname in colnames(data)) { | |
if (any(is.na(data[[colname]]))) { | |
message(colname, "has NAs") | |
} | |
} | |
print(summary(data)) | |
} | |
t_test_pre_vs_post <- function(data, feature_name) { | |
data$date <- as.Date(data$createdOn) | |
## get rid of weird future dates | |
data <- data[ data$date <= Sys.Date() + 1, ] | |
## split data into pre and post sections and merge on health code and date | |
pre <- data[ data$category=="case pre", c("healthCode", "date", "recordId", "age", feature_name)] | |
post <- data[ data$category=="case post", c("healthCode", "date", "recordId", feature_name)] | |
pp <- merge(pre, post, by=c("healthCode", "date"), suffixes=c(".pre", ".post")) | |
return(t.test(x=pp[[paste0(feature_name, ".pre")]], y=pp[[paste0(feature_name, ".post")]], paired=TRUE)) | |
} | |
t_test_case_vs_control <- function(data, feature_name) { | |
case <- data[ data$`professional-diagnosis`, ] | |
control <- data[ !data$`professional-diagnosis`, ] | |
return(t.test(x=case[[feature_name]], y=control[[feature_name]], paired=FALSE)) | |
} | |
## Compare distribution of activities by participants of a given age (black) | |
## with that of the control population (red) with age matched activities (blue). | |
plot_age_distribution <- function(data, data_age_matched) { | |
a <- 3 | |
plot(density(data_age_matched$age[data_age_matched$`professional-diagnosis`], adjust=a), | |
lwd=2, ylim=c(0,0.06), main="Activity density by participant age", | |
xlab="Age") | |
lines(density(data_age_matched$age[!data_age_matched$`professional-diagnosis`], adjust=a), col='blue') | |
lines(density(data$age[!data$`professional-diagnosis`], adjust=a), col='red') | |
legend(x="topright", col=c("black", "blue", "red"), lty=1, legend=c("case ages", "cntl ages matched", "cntl ages before matching"), cex=0.8) | |
} | |
## Tried two different methods of age matching here, but the results were very | |
## similar. The "band" method is simpler, so let's stick with that. | |
age_match <- function(data, method="band", plot=FALSE) { | |
if (method=="band") { | |
data_age_matched <- extract_age_band(data, sds=2) | |
} | |
else if (method=="resample") { | |
data_age_matched <- resample_controls_matching_age(data, min_age=50, width=2) | |
} | |
else { | |
stop("Unexpected age matching method: ", method) | |
} | |
if (plot) { | |
plot_age_distribution(data, data_age_matched) | |
} | |
data_age_matched | |
} | |
``` | |
Read demographics data. | |
```{r demographics} | |
demographics_synid <- 'syn5762676' | |
q <- sprintf('select healthCode, age, "professional-diagnosis" from %s where age is not null and "professional-diagnosis" is not null', demographics_synid) | |
results <- synTableQuery(q) | |
demo <- results@values | |
``` | |
Specify how to match control ages to case ages. | |
```{r select_age_matching_method} | |
age_matching_method = "band" ## band or resample | |
plot_age_matching = TRUE | |
``` | |
## Balance | |
Compare case pre-medication, post-medication, and control. | |
```{r balance} | |
data <- get_feature_data(demo, | |
activity_table_synid='syn5762682', | |
feature_table_synid='syn5987317') | |
data <- na.omit(data) | |
data <- age_match(data, method=age_matching_method, plot=plot_age_matching) | |
data <- label_categories(data) | |
check_data(data) | |
print(t_test_pre_vs_post(data, feature_name='zcrAA')) | |
print(t_test_case_vs_control(data, feature_name='zcrAA')) | |
plot_case_vs_control(data, | |
activity_name='Balance', | |
feature_name='zcrAA') | |
``` | |
## Gait | |
Compare case pre-medication, post-medication, and control. | |
```{r gait} | |
data <- get_feature_data(demo, | |
activity_table_synid='syn5762682', | |
feature_table_synid='syn5987318') | |
data <- na.omit(data) | |
data <- age_match(data, method=age_matching_method, plot=plot_age_matching) | |
data <- label_categories(data) | |
check_data(data) | |
print(t_test_pre_vs_post(data, feature_name='F0XY')) | |
print(t_test_case_vs_control(data, feature_name='F0XY')) | |
plot_case_vs_control(data, | |
activity_name="Gait", | |
feature_name='F0XY', | |
feature_range_limit=c(0,4)) | |
``` | |
## Tapping | |
Compare case pre-medication, post-medication, and control. | |
```{r tapping} | |
data <- get_feature_data(demo, | |
activity_table_synid='syn5762680', | |
feature_table_synid='syn5987315') | |
data <- na.omit(data) | |
data <- age_match(data, method=age_matching_method, plot=plot_age_matching) | |
data <- label_categories(data) | |
check_data(data) | |
print(t_test_pre_vs_post(data, feature_name='tap_count')) | |
print(t_test_case_vs_control(data, feature_name='tap_count')) | |
plot_case_vs_control(data, | |
activity_name="Tapping", | |
feature_name='tap_count') | |
``` | |
## Voice | |
Compare case pre-medication, post-medication, and control. | |
```{r voice} | |
data <- get_feature_data(demo, | |
activity_table_synid='syn5762681', | |
feature_table_synid='syn5987316') | |
data <- na.omit(data) | |
data <- age_match(data, method=age_matching_method, plot=plot_age_matching) | |
data <- label_categories(data) | |
check_data(data) | |
print(t_test_pre_vs_post(data, feature_name='medianF0')) | |
print(t_test_case_vs_control(data, feature_name='medianF0')) | |
plot_case_vs_control(data, | |
activity_name="Voice", | |
feature_name='medianF0') | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment