Last active
January 16, 2017 15:00
-
-
Save mlist/c18b858994e7c8d699da77c116462846 to your computer and use it in GitHub Desktop.
Demo script for batch effect analysis of DEEP data using DeepBlueR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# authentication, register for DeepBlue, log in, open your account info, | |
# copy and paste user key here | |
auth_key <- "xxxxxxxxxx" | |
#dependencies | |
library(foreach) | |
library(matrixStats) | |
library(DeepBlueR) | |
library(stringr) | |
library(corrplot) | |
library(deepRtools) | |
library(impute) | |
library(dplyr) | |
library(ggplot2) | |
library(RColorBrewer) | |
library(ggrepel) | |
# epigenetic marks to consider | |
epi_marks <- c("DNA methylation", | |
"DNA accessibility", | |
"H3K4me3", | |
"H3K27me3", | |
"H3K27ac", | |
"H3K4me1", | |
"H3K9me3", | |
"H3K36me3") | |
# method for plotting first two PCs | |
plot_first_two_pcas <- function(pcas, meta, pc1.threshold.for.label = 1, color_by = "BIOMATERIAL_PROVIDER"){ | |
plot.data <- as.data.frame(pcas$rotation) %>% | |
tibble::rownames_to_column(var = "DEEP_SAMPLE_ID") %>% | |
dplyr::left_join(meta, by=c("DEEP_SAMPLE_ID")) | |
deep_ids <- deepRtools::deepSampleIds2dataFrame(meta$DEEP_SAMPLE_ID) | |
plot.data <- dplyr::left_join(plot.data, deep_ids, by=c("DEEP_SAMPLE_ID" = "id")) | |
colourCount <- 9 | |
getPalette <- colorRampPalette(brewer.pal(colourCount, "Set1")) | |
ggplot(plot.data, aes_string(x = "PC1", | |
y = "PC2", | |
label = "DEEP_SAMPLE_ID", | |
color = color_by, | |
shape = "organ_short"))+ | |
geom_point(size = 5, alpha = 0.5) + | |
geom_label_repel(data = plot.data[which(pcas$rotation[,1] < pc1.threshold.for.label),]) + | |
theme_bw() + | |
xlab(paste("PC1 (", round(pcas$sdev[1]^2/sum(pcas$sdev^2), 2) * 100, "%)")) + | |
ylab(paste("PC2 (", round(pcas$sdev[2]^2/sum(pcas$sdev^2), 2) * 100, "%)")) | |
} | |
batch_effect_results <- foreach(epi_mark = epi_marks) %do%{ | |
deep_experiments <- deepblue_list_experiments(project = "DEEP", | |
user_key = auth_key, | |
epigenetic_mark = epi_mark, | |
type = "signal", | |
genome = "hs37d5") | |
if(epi_mark == "DNA methylation"){ | |
deep_experiments <- deep_experiments[grep("filtered.CG.bedgraph", deepblue_extract_names(deep_experiments)),] | |
}else if(epi_mark == "DNA accessibility"){ | |
deep_experiments <- deep_experiments[grep("filtered.GCH.bedgraph", deepblue_extract_names(deep_experiments)),] | |
}else{ | |
deep_experiments <- deep_experiments[-grep("nodup|bamcomp", deepblue_extract_names(deep_experiments)),] | |
} | |
# extract sample meta data | |
deep_meta <- deepblue_info(deepblue_extract_ids(deep_experiments), user_key = auth_key) | |
# transform list to data frame | |
deep_meta_df <- unique(foreach(sample = deep_meta, | |
.combine = bind_rows, | |
.inorder = TRUE) %do% { | |
meta <- as.data.frame(sample$sample_info) | |
meta$epigenetic_mark <- sample$epigenetic_mark | |
meta$data_type <- sample$data_type | |
meta$genome <- sample$genome | |
meta$experiment <- sample$name | |
meta$technique <- sample$technique | |
meta <- cbind(meta, sample$extra_metadata) | |
return(meta) | |
}) | |
experiments_columns <- list() | |
for(experiment in deepblue_extract_names(deep_experiments)) { | |
experiments_columns[[experiment]] <- "VALUE" | |
} | |
tiling_regions <- deepblue_tiling_regions(size=5000, chromosome = "22", genome="hs37d5", user_key = auth_key) | |
request_id <- deepblue_score_matrix( | |
experiments_columns = experiments_columns, | |
aggregation_function = "mean", aggregation_regions_id=tiling_regions, | |
user_key = auth_key) | |
score_matrix <- deepblue_download_request_data(request_id = request_id, user_key = auth_key) | |
filtered_score_matrix <- scale(as.matrix(score_matrix[,4:ncol(score_matrix)])) | |
#remove regions with more than NA values | |
filtered_score_matrix <- filtered_score_matrix[which(complete.cases(filtered_score_matrix)),] | |
#commented out: optional filter for variable regions only | |
#filtered_score_matrix <- filtered_score_matrix[ | |
# which(rowSums(is.na(filtered_score_matrix)) / | |
# ncol(filtered_score_matrix) < 0.05),] | |
#keep only regions with more than 0.05 variance | |
#filtered_score_matrix_rowVars <- rowVars(filtered_score_matrix, na.rm = T) | |
#filtered_score_matrix <- filtered_score_matrix[which(filtered_score_matrix_rowVars > .05),] | |
colnames(filtered_score_matrix) <- deep_meta_df$DEEP_SAMPLE_ID | |
score_matrix_col_sums <- colSums(is.na(filtered_score_matrix)) / | |
nrow(filtered_score_matrix) | |
problematic_samples <- which(score_matrix_col_sums > 0.1) | |
if(length(problematic_samples) > 0){ | |
warning(paste("omitted samples with > 10% missing entries:", paste(colnames(filtered_score_matrix)[problematic_samples], collapse = ","), sep= " ")) | |
filtered_score_matrix <- filtered_score_matrix[,-problematic_samples] | |
cat("% of missing values per sample:") | |
print(score_matrix_col_sums) | |
} | |
filtered_score_matrix <- impute.knn(as.matrix(filtered_score_matrix)[,-seq(1,3)], k = 3) | |
attr(filtered_score_matrix, "meta") <- deep_meta_df | |
return(filtered_score_matrix) | |
} | |
# generate plots | |
for(i in 1:length(epi_marks)){ | |
score_matrix <- batch_effect_results[[i]]$data | |
corrplot(cor(score_matrix, | |
use = "pairwise.complete.obs"), | |
title = epi_marks[[i]], | |
mar = c(1,1,1,1), tl.cex = 0.8, tl.col = "black") | |
pcas <-prcomp(score_matrix, center = TRUE, scale. = TRUE) | |
print(plot_first_two_pcas(pcas, attr(batch_effect_results[[i]], "meta"), | |
color_by = "biosource_name", | |
pc1.threshold.for.label = 0.175) + | |
ggtitle(epi_marks[[i]]) + | |
theme(legend.position="bottom",legend.direction="horizontal")) | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### SETUP ### | |
library(foreach) | |
library(DeepBlueR) | |
library(matrixStats) | |
deepblue_options(user_key = "qWe1anLjd66OfTAn") | |
# epigenetic marks to consider | |
epi_marks <- c("DNA methylation", | |
"DNA accessibility", | |
"H3K4me3", | |
"H3K27me3", | |
"H3K27ac", | |
"H3K4me1", | |
"H3K9me3", | |
"H3K36me3") | |
# annotations to consider | |
tiling_1k <- deepblue_tiling_regions(size=1000,genome="hs37d5") | |
tiling_2k <- deepblue_tiling_regions(size=2000,genome="hs37d5") | |
tiling_5k <- deepblue_tiling_regions(size=5000,genome="hs37d5") | |
regulatory <- deepblue_select_annotations("ENSEMBL regulatory build", "hs37d5") | |
annotations <- c(tiling_1k, tiling_2k, tiling_5k, regulatory) | |
annotation_names <- c("1kb tiling", "2kb tiling", "5kb tiling", "ENSEMBL regulatory build") | |
### SELECT EXPERIMENTS AND REQUEST SCORE MATRICES ### | |
batch_effect_request_ids <- foreach(epi_mark = epi_marks, | |
.inorder = TRUE, | |
.final = function(x) | |
setNames(x, epi_marks)) %do% { | |
deep_experiments <- deepblue_list_experiments(project = "DEEP", | |
epigenetic_mark = epi_mark, | |
type = "signal", | |
genome = "hs37d5") | |
if(epi_mark == "DNA methylation"){ | |
deep_experiments <- deep_experiments[grep("filtered.CG.bedgraph", deepblue_extract_names(deep_experiments)),] | |
}else if(epi_mark == "DNA accessibility"){ | |
deep_experiments <- deep_experiments[grep("filtered.GCH.bedgraph", deepblue_extract_names(deep_experiments)),] | |
}else{ | |
deep_experiments <- deep_experiments[-grep("nodup|bamcomp", deepblue_extract_names(deep_experiments)),] | |
} | |
# extract sample meta data | |
deep_meta <- deepblue_meta_data_to_table(deepblue_extract_ids(deep_experiments)) | |
experiments_columns <- deepblue_select_column(deep_experiments, | |
column = "VALUE") | |
result <- foreach(annotation = annotations, | |
.inorder = TRUE, | |
.final = function(x) setNames(x, annotation_names)) %do% { | |
request_id <- deepblue_score_matrix( | |
experiments_columns = experiments_columns, | |
aggregation_function = "mean", | |
aggregation_regions_id = annotation) | |
return(request_id) | |
} | |
attr(result, "meta") <- deep_meta | |
return(result) | |
} | |
### CHECK THE PROGRESS ### | |
foreach(epi_mark = epi_marks, .combine = rbind) %do%{ | |
foreach(annotation = annotation_names, .combine = rbind) %do% { | |
request_id <- batch_effect_request_ids[[epi_mark]][[annotation]] | |
info <- deepblue_info(request_id) | |
if(info$state %in% c("running", "new")){ | |
time_elapsed = format(difftime( | |
Sys.time(), | |
strptime(info$create_time, | |
"%Y-%b-%d %H:%M:%S", tz = "GMT") | |
) | |
)} | |
else if(info$state == "error"){ | |
time_elapsed = NA | |
} | |
else{ | |
time_elapsed = format(difftime( | |
strptime(info$finish_time, | |
"%Y-%b-%d %H:%M:%S", tz = "GMT"), | |
strptime(info$create_time, | |
"%Y-%b-%d %H:%M:%S", tz = "GMT") | |
) | |
) | |
} | |
data.frame(epi_mark = epi_mark, | |
annotation = annotation, | |
state = info$state, | |
time_elapsed = time_elapsed | |
) | |
} | |
} | |
#select the ones already finished... | |
epi_marks <- c("DNA methylation") | |
annotation_names <- c("ENSEMBL regulatory build", "5kb tiling") | |
### DOWNLOAD RESULTS ### | |
batch_effect_results <- foreach(epi_mark = epi_marks, | |
.inorder = TRUE, | |
.final = function(x) | |
setNames(x, epi_marks)) %do% { | |
foreach(annotation = annotation_names, | |
.inorder = TRUE, | |
.final = function(x) setNames(x, annotation_names)) %do% { | |
request_id <- batch_effect_request_ids[[epi_mark]][[annotation]] | |
if(is.null(request_id)) stop(paste("request id not found for", epi_mark, annotation)) | |
score_matrix <- deepblue_download_request_data(request_id = request_id) | |
filtered_score_matrix <- scale(as.matrix(score_matrix[,4:ncol(score_matrix)])) | |
#removing regions with > 10% missing values | |
score_matrix_row_sums <- rowSums(is.na(filtered_score_matrix)) / | |
ncol(filtered_score_matrix) | |
problematic_regions <- which(score_matrix_row_sums > 0.1) | |
if(length(problematic_regions) > 0){ | |
warning(paste("omitted regions with > 10% missing entries")) | |
filtered_score_matrix <- filtered_score_matrix[-problematic_regions,] | |
} | |
#removing samples with > 10% missing values | |
score_matrix_col_sums <- colSums(is.na(filtered_score_matrix)) / | |
nrow(filtered_score_matrix) | |
problematic_samples <- which(score_matrix_col_sums > 0.1) | |
if(length(problematic_samples) > 0){ | |
warning(paste("omitted samples with > 10% missing entries:", | |
paste(colnames(filtered_score_matrix)[problematic_samples], collapse = ","), sep= " ")) | |
filtered_score_matrix <- filtered_score_matrix[,-problematic_samples] | |
cat("% of missing values per sample:") | |
print(score_matrix_col_sums) | |
} | |
#imputing missing values | |
#message("imputing missing values with k nearest neighbor (k = 3)") | |
#filtered_score_matrix <- impute.knn(as.matrix(filtered_score_matrix), k = 3) | |
return(filtered_score_matrix) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment