Skip to content

Instantly share code, notes, and snippets.

@cbare
Created June 8, 2016 00:05
Show Gist options
  • Save cbare/498f537c6417e16939bbdf75a552d285 to your computer and use it in GitHub Desktop.
Save cbare/498f537c6417e16939bbdf75a552d285 to your computer and use it in GitHub Desktop.
Assess the 4 selected features for the mPower dashboard, committing some fun R data geekery in the process
---
title: "Directionality of mPower features"
author: "J. Christopher Bare"
date: "June 6, 2016"
---
```{r imports, echo=FALSE, message=FALSE, warning=FALSE}
library(synapseClient)
library(ggplot2)
library(scales)
synapseLogin()
```
Define functions to read activity meta-data and feature table, perform age-matching, and make plots comparing case and control populations.
```{r define_functions, echo=FALSE}
read_activity_data <- function(activity_table_synid) {
q <- sprintf('select healthCode, recordId, medTimepoint, createdOn from %s', activity_table_synid)
results <- synTableQuery(q)
activity <- results@values
}
read_feature_table <- function(feature_table_synid) {
q <- sprintf('select * from %s', feature_table_synid)
results <- synTableQuery(q)
features <- results@values
}
merge_feature_data <- function(demo, activity, features) {
data <- merge(activity, features, by='recordId')
data <- merge(demo, data, by='healthCode')
}
get_feature_data <- function(demo, activity_table_synid, feature_table_synid) {
activity <- read_activity_data(activity_table_synid)
features <- read_feature_table(feature_table_synid)
data <- merge_feature_data(demo, activity, features)
}
plot_case_vs_control <- function(data, activity_name, feature_name, ...) {
ellipsis_args <- list(...)
p <- ggplot(data, aes_string(x="age", y=feature_name,
color="category",
shape="category")) +
geom_point(alpha=1/5, position=position_jitter(width=1)) +
geom_smooth(method="lm") +
geom_smooth(method="lm", color=alpha("black",0.3)) +
ggtitle(sprintf("%s vs age", feature_name))
## Note that we have to carefully use coord_cartesian(ylim=...) here
## because setting ylim by itself causes ggplot to (helpfully??)
## leave data points to be left out of the regression.
if ('feature_range_limit' %in% names(ellipsis_args)) {
p <- p + coord_cartesian(ylim=ellipsis_args$feature_range_limit)
}
print(p)
## label the boxplots with their median value
medians <- data.frame(category=unique(data$category),
med=sapply(unique(data$category), function(cat) {
median(data[[feature_name]][data$category==cat])
}))
p <- ggplot(data, aes_string(x="category",
y=feature_name,
fill="category")) +
geom_boxplot() +
theme(legend.position="none") +
ggtitle(sprintf("Case vs control %s", feature_name)) +
geom_text(data=medians, aes(x=category, y=med, label=med), size=3, vjust=-1.5)
if ('feature_range_limit' %in% names(ellipsis_args)) {
p <- p + coord_cartesian(ylim=ellipsis_args$feature_range_limit)
}
print(p)
p <- ggplot(data, aes_string(feature_name,
fill="category",
color="category")) +
geom_density(alpha = 0.1) +
ggtitle(sprintf("Case vs control %s", feature_name))
if ('feature_range_limit' %in% names(ellipsis_args)) {
p <- p + coord_cartesian(xlim=ellipsis_args$feature_range_limit)
}
print(p)
}
resample_controls_matching_age <- function(data, width=5, min_age=0) {
cases <- data[ data$`professional-diagnosis` & data$age>=min_age, ]
controls <- data[ !data$`professional-diagnosis`, ]
## create index from age to controls of similar age
index <- lapply(min(cases$age):max(cases$age), function(age) {
if (age > 80) {
which(controls$age > 80)
}
else {
which(controls$age >= age-width & controls$age <= age+width)
}
})
names(index) <- min(cases$age):max(cases$age)
resampled_control_indices <- sapply(sample(cases$age, nrow(cases), replace=TRUE),
function (age) {
sample(index[[as.character(age)]], 1)
})
resampled_controls <- controls[resampled_control_indices,]
rbind(cases, resampled_controls)
}
extract_age_band <- function(data, sds=2) {
cases <- data[ data$`professional-diagnosis`, ]
controls <- data[ !data$`professional-diagnosis`, ]
lower_bound <- median(cases$age) - sds*sd(cases$age)
upper_bound <- median(cases$age) + sds*sd(cases$age)
cases_in_age_band <- cases[cases$age >= lower_bound & cases$age <= upper_bound, ]
controls_in_age_band <- controls[controls$age >= lower_bound & controls$age <= upper_bound, ]
rbind(cases_in_age_band, controls_in_age_band)
}
label_categories <- function(data) {
medTimepointStrings <- c(
'Immediately before Parkinson medication',
'Immediately before taking Parkinson medication',
'Just after Parkinson medication (at your best)',
'Just after taking Parkinson medication (at your best)')
pre <- medTimepointStrings[1:2]
post <- medTimepointStrings[3:4]
## label the three remaining categories
data$category <- ifelse(data$`professional-diagnosis`, 'case', 'control')
data$category[data$`professional-diagnosis` & data$medTimepoint %in% pre] <- 'case pre'
data$category[data$`professional-diagnosis` & data$medTimepoint %in% post] <- 'case post'
## remove activities of cases that are not at pre or post med time points
data <- data[data$category!='case',]
## convert category to an ordered factor
data$category <- factor(data$category, levels=c("case pre", "case post", "control"), ordered=TRUE)
if (length(unique(data$category)) > 3) stop("Unexpected categories: ", unique(data$category))
return(data)
}
check_data <- function(data) {
message(sprintf("dim(data) = %d x %d", nrow(data), ncol(data)))
for (colname in colnames(data)) {
if (any(is.na(data[[colname]]))) {
message(colname, "has NAs")
}
}
print(summary(data))
}
t_test_pre_vs_post <- function(data, feature_name) {
data$date <- as.Date(data$createdOn)
## get rid of weird future dates
data <- data[ data$date <= Sys.Date() + 1, ]
## split data into pre and post sections and merge on health code and date
pre <- data[ data$category=="case pre", c("healthCode", "date", "recordId", "age", feature_name)]
post <- data[ data$category=="case post", c("healthCode", "date", "recordId", feature_name)]
pp <- merge(pre, post, by=c("healthCode", "date"), suffixes=c(".pre", ".post"))
return(t.test(x=pp[[paste0(feature_name, ".pre")]], y=pp[[paste0(feature_name, ".post")]], paired=TRUE))
}
t_test_case_vs_control <- function(data, feature_name) {
case <- data[ data$`professional-diagnosis`, ]
control <- data[ !data$`professional-diagnosis`, ]
return(t.test(x=case[[feature_name]], y=control[[feature_name]], paired=FALSE))
}
## Compare distribution of activities by participants of a given age (black)
## with that of the control population (red) with age matched activities (blue).
plot_age_distribution <- function(data, data_age_matched) {
a <- 3
plot(density(data_age_matched$age[data_age_matched$`professional-diagnosis`], adjust=a),
lwd=2, ylim=c(0,0.06), main="Activity density by participant age",
xlab="Age")
lines(density(data_age_matched$age[!data_age_matched$`professional-diagnosis`], adjust=a), col='blue')
lines(density(data$age[!data$`professional-diagnosis`], adjust=a), col='red')
legend(x="topright", col=c("black", "blue", "red"), lty=1, legend=c("case ages", "cntl ages matched", "cntl ages before matching"), cex=0.8)
}
## Tried two different methods of age matching here, but the results were very
## similar. The "band" method is simpler, so let's stick with that.
age_match <- function(data, method="band", plot=FALSE) {
if (method=="band") {
data_age_matched <- extract_age_band(data, sds=2)
}
else if (method=="resample") {
data_age_matched <- resample_controls_matching_age(data, min_age=50, width=2)
}
else {
stop("Unexpected age matching method: ", method)
}
if (plot) {
plot_age_distribution(data, data_age_matched)
}
data_age_matched
}
```
Read demographics data.
```{r demographics}
demographics_synid <- 'syn5762676'
q <- sprintf('select healthCode, age, "professional-diagnosis" from %s where age is not null and "professional-diagnosis" is not null', demographics_synid)
results <- synTableQuery(q)
demo <- results@values
```
Specify how to match control ages to case ages.
```{r select_age_matching_method}
age_matching_method = "band" ## band or resample
plot_age_matching = TRUE
```
## Balance
Compare case pre-medication, post-medication, and control.
```{r balance}
data <- get_feature_data(demo,
activity_table_synid='syn5762682',
feature_table_synid='syn5987317')
data <- na.omit(data)
data <- age_match(data, method=age_matching_method, plot=plot_age_matching)
data <- label_categories(data)
check_data(data)
print(t_test_pre_vs_post(data, feature_name='zcrAA'))
print(t_test_case_vs_control(data, feature_name='zcrAA'))
plot_case_vs_control(data,
activity_name='Balance',
feature_name='zcrAA')
```
## Gait
Compare case pre-medication, post-medication, and control.
```{r gait}
data <- get_feature_data(demo,
activity_table_synid='syn5762682',
feature_table_synid='syn5987318')
data <- na.omit(data)
data <- age_match(data, method=age_matching_method, plot=plot_age_matching)
data <- label_categories(data)
check_data(data)
print(t_test_pre_vs_post(data, feature_name='F0XY'))
print(t_test_case_vs_control(data, feature_name='F0XY'))
plot_case_vs_control(data,
activity_name="Gait",
feature_name='F0XY',
feature_range_limit=c(0,4))
```
## Tapping
Compare case pre-medication, post-medication, and control.
```{r tapping}
data <- get_feature_data(demo,
activity_table_synid='syn5762680',
feature_table_synid='syn5987315')
data <- na.omit(data)
data <- age_match(data, method=age_matching_method, plot=plot_age_matching)
data <- label_categories(data)
check_data(data)
print(t_test_pre_vs_post(data, feature_name='tap_count'))
print(t_test_case_vs_control(data, feature_name='tap_count'))
plot_case_vs_control(data,
activity_name="Tapping",
feature_name='tap_count')
```
## Voice
Compare case pre-medication, post-medication, and control.
```{r voice}
data <- get_feature_data(demo,
activity_table_synid='syn5762681',
feature_table_synid='syn5987316')
data <- na.omit(data)
data <- age_match(data, method=age_matching_method, plot=plot_age_matching)
data <- label_categories(data)
check_data(data)
print(t_test_pre_vs_post(data, feature_name='medianF0'))
print(t_test_case_vs_control(data, feature_name='medianF0'))
plot_case_vs_control(data,
activity_name="Voice",
feature_name='medianF0')
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment