Skip to content

Instantly share code, notes, and snippets.

@mlist
Last active January 16, 2017 15:00
Show Gist options
  • Save mlist/c18b858994e7c8d699da77c116462846 to your computer and use it in GitHub Desktop.
Save mlist/c18b858994e7c8d699da77c116462846 to your computer and use it in GitHub Desktop.
Demo script for batch effect analysis of DEEP data using DeepBlueR
# authentication, register for DeepBlue, log in, open your account info,
# copy and paste user key here
auth_key <- "xxxxxxxxxx"
#dependencies
library(foreach)
library(matrixStats)
library(DeepBlueR)
library(stringr)
library(corrplot)
library(deepRtools)
library(impute)
library(dplyr)
library(ggplot2)
library(RColorBrewer)
library(ggrepel)
# epigenetic marks to consider
epi_marks <- c("DNA methylation",
"DNA accessibility",
"H3K4me3",
"H3K27me3",
"H3K27ac",
"H3K4me1",
"H3K9me3",
"H3K36me3")
# method for plotting first two PCs
plot_first_two_pcas <- function(pcas, meta, pc1.threshold.for.label = 1, color_by = "BIOMATERIAL_PROVIDER"){
plot.data <- as.data.frame(pcas$rotation) %>%
tibble::rownames_to_column(var = "DEEP_SAMPLE_ID") %>%
dplyr::left_join(meta, by=c("DEEP_SAMPLE_ID"))
deep_ids <- deepRtools::deepSampleIds2dataFrame(meta$DEEP_SAMPLE_ID)
plot.data <- dplyr::left_join(plot.data, deep_ids, by=c("DEEP_SAMPLE_ID" = "id"))
colourCount <- 9
getPalette <- colorRampPalette(brewer.pal(colourCount, "Set1"))
ggplot(plot.data, aes_string(x = "PC1",
y = "PC2",
label = "DEEP_SAMPLE_ID",
color = color_by,
shape = "organ_short"))+
geom_point(size = 5, alpha = 0.5) +
geom_label_repel(data = plot.data[which(pcas$rotation[,1] < pc1.threshold.for.label),]) +
theme_bw() +
xlab(paste("PC1 (", round(pcas$sdev[1]^2/sum(pcas$sdev^2), 2) * 100, "%)")) +
ylab(paste("PC2 (", round(pcas$sdev[2]^2/sum(pcas$sdev^2), 2) * 100, "%)"))
}
batch_effect_results <- foreach(epi_mark = epi_marks) %do%{
deep_experiments <- deepblue_list_experiments(project = "DEEP",
user_key = auth_key,
epigenetic_mark = epi_mark,
type = "signal",
genome = "hs37d5")
if(epi_mark == "DNA methylation"){
deep_experiments <- deep_experiments[grep("filtered.CG.bedgraph", deepblue_extract_names(deep_experiments)),]
}else if(epi_mark == "DNA accessibility"){
deep_experiments <- deep_experiments[grep("filtered.GCH.bedgraph", deepblue_extract_names(deep_experiments)),]
}else{
deep_experiments <- deep_experiments[-grep("nodup|bamcomp", deepblue_extract_names(deep_experiments)),]
}
# extract sample meta data
deep_meta <- deepblue_info(deepblue_extract_ids(deep_experiments), user_key = auth_key)
# transform list to data frame
deep_meta_df <- unique(foreach(sample = deep_meta,
.combine = bind_rows,
.inorder = TRUE) %do% {
meta <- as.data.frame(sample$sample_info)
meta$epigenetic_mark <- sample$epigenetic_mark
meta$data_type <- sample$data_type
meta$genome <- sample$genome
meta$experiment <- sample$name
meta$technique <- sample$technique
meta <- cbind(meta, sample$extra_metadata)
return(meta)
})
experiments_columns <- list()
for(experiment in deepblue_extract_names(deep_experiments)) {
experiments_columns[[experiment]] <- "VALUE"
}
tiling_regions <- deepblue_tiling_regions(size=5000, chromosome = "22", genome="hs37d5", user_key = auth_key)
request_id <- deepblue_score_matrix(
experiments_columns = experiments_columns,
aggregation_function = "mean", aggregation_regions_id=tiling_regions,
user_key = auth_key)
score_matrix <- deepblue_download_request_data(request_id = request_id, user_key = auth_key)
filtered_score_matrix <- scale(as.matrix(score_matrix[,4:ncol(score_matrix)]))
#remove regions with more than NA values
filtered_score_matrix <- filtered_score_matrix[which(complete.cases(filtered_score_matrix)),]
#commented out: optional filter for variable regions only
#filtered_score_matrix <- filtered_score_matrix[
# which(rowSums(is.na(filtered_score_matrix)) /
# ncol(filtered_score_matrix) < 0.05),]
#keep only regions with more than 0.05 variance
#filtered_score_matrix_rowVars <- rowVars(filtered_score_matrix, na.rm = T)
#filtered_score_matrix <- filtered_score_matrix[which(filtered_score_matrix_rowVars > .05),]
colnames(filtered_score_matrix) <- deep_meta_df$DEEP_SAMPLE_ID
score_matrix_col_sums <- colSums(is.na(filtered_score_matrix)) /
nrow(filtered_score_matrix)
problematic_samples <- which(score_matrix_col_sums > 0.1)
if(length(problematic_samples) > 0){
warning(paste("omitted samples with > 10% missing entries:", paste(colnames(filtered_score_matrix)[problematic_samples], collapse = ","), sep= " "))
filtered_score_matrix <- filtered_score_matrix[,-problematic_samples]
cat("% of missing values per sample:")
print(score_matrix_col_sums)
}
filtered_score_matrix <- impute.knn(as.matrix(filtered_score_matrix)[,-seq(1,3)], k = 3)
attr(filtered_score_matrix, "meta") <- deep_meta_df
return(filtered_score_matrix)
}
# generate plots
for(i in 1:length(epi_marks)){
score_matrix <- batch_effect_results[[i]]$data
corrplot(cor(score_matrix,
use = "pairwise.complete.obs"),
title = epi_marks[[i]],
mar = c(1,1,1,1), tl.cex = 0.8, tl.col = "black")
pcas <-prcomp(score_matrix, center = TRUE, scale. = TRUE)
print(plot_first_two_pcas(pcas, attr(batch_effect_results[[i]], "meta"),
color_by = "biosource_name",
pc1.threshold.for.label = 0.175) +
ggtitle(epi_marks[[i]]) +
theme(legend.position="bottom",legend.direction="horizontal"))
}
### SETUP ###
library(foreach)
library(DeepBlueR)
library(matrixStats)
deepblue_options(user_key = "qWe1anLjd66OfTAn")
# epigenetic marks to consider
epi_marks <- c("DNA methylation",
"DNA accessibility",
"H3K4me3",
"H3K27me3",
"H3K27ac",
"H3K4me1",
"H3K9me3",
"H3K36me3")
# annotations to consider
tiling_1k <- deepblue_tiling_regions(size=1000,genome="hs37d5")
tiling_2k <- deepblue_tiling_regions(size=2000,genome="hs37d5")
tiling_5k <- deepblue_tiling_regions(size=5000,genome="hs37d5")
regulatory <- deepblue_select_annotations("ENSEMBL regulatory build", "hs37d5")
annotations <- c(tiling_1k, tiling_2k, tiling_5k, regulatory)
annotation_names <- c("1kb tiling", "2kb tiling", "5kb tiling", "ENSEMBL regulatory build")
### SELECT EXPERIMENTS AND REQUEST SCORE MATRICES ###
batch_effect_request_ids <- foreach(epi_mark = epi_marks,
.inorder = TRUE,
.final = function(x)
setNames(x, epi_marks)) %do% {
deep_experiments <- deepblue_list_experiments(project = "DEEP",
epigenetic_mark = epi_mark,
type = "signal",
genome = "hs37d5")
if(epi_mark == "DNA methylation"){
deep_experiments <- deep_experiments[grep("filtered.CG.bedgraph", deepblue_extract_names(deep_experiments)),]
}else if(epi_mark == "DNA accessibility"){
deep_experiments <- deep_experiments[grep("filtered.GCH.bedgraph", deepblue_extract_names(deep_experiments)),]
}else{
deep_experiments <- deep_experiments[-grep("nodup|bamcomp", deepblue_extract_names(deep_experiments)),]
}
# extract sample meta data
deep_meta <- deepblue_meta_data_to_table(deepblue_extract_ids(deep_experiments))
experiments_columns <- deepblue_select_column(deep_experiments,
column = "VALUE")
result <- foreach(annotation = annotations,
.inorder = TRUE,
.final = function(x) setNames(x, annotation_names)) %do% {
request_id <- deepblue_score_matrix(
experiments_columns = experiments_columns,
aggregation_function = "mean",
aggregation_regions_id = annotation)
return(request_id)
}
attr(result, "meta") <- deep_meta
return(result)
}
### CHECK THE PROGRESS ###
foreach(epi_mark = epi_marks, .combine = rbind) %do%{
foreach(annotation = annotation_names, .combine = rbind) %do% {
request_id <- batch_effect_request_ids[[epi_mark]][[annotation]]
info <- deepblue_info(request_id)
if(info$state %in% c("running", "new")){
time_elapsed = format(difftime(
Sys.time(),
strptime(info$create_time,
"%Y-%b-%d %H:%M:%S", tz = "GMT")
)
)}
else if(info$state == "error"){
time_elapsed = NA
}
else{
time_elapsed = format(difftime(
strptime(info$finish_time,
"%Y-%b-%d %H:%M:%S", tz = "GMT"),
strptime(info$create_time,
"%Y-%b-%d %H:%M:%S", tz = "GMT")
)
)
}
data.frame(epi_mark = epi_mark,
annotation = annotation,
state = info$state,
time_elapsed = time_elapsed
)
}
}
#select the ones already finished...
epi_marks <- c("DNA methylation")
annotation_names <- c("ENSEMBL regulatory build", "5kb tiling")
### DOWNLOAD RESULTS ###
batch_effect_results <- foreach(epi_mark = epi_marks,
.inorder = TRUE,
.final = function(x)
setNames(x, epi_marks)) %do% {
foreach(annotation = annotation_names,
.inorder = TRUE,
.final = function(x) setNames(x, annotation_names)) %do% {
request_id <- batch_effect_request_ids[[epi_mark]][[annotation]]
if(is.null(request_id)) stop(paste("request id not found for", epi_mark, annotation))
score_matrix <- deepblue_download_request_data(request_id = request_id)
filtered_score_matrix <- scale(as.matrix(score_matrix[,4:ncol(score_matrix)]))
#removing regions with > 10% missing values
score_matrix_row_sums <- rowSums(is.na(filtered_score_matrix)) /
ncol(filtered_score_matrix)
problematic_regions <- which(score_matrix_row_sums > 0.1)
if(length(problematic_regions) > 0){
warning(paste("omitted regions with > 10% missing entries"))
filtered_score_matrix <- filtered_score_matrix[-problematic_regions,]
}
#removing samples with > 10% missing values
score_matrix_col_sums <- colSums(is.na(filtered_score_matrix)) /
nrow(filtered_score_matrix)
problematic_samples <- which(score_matrix_col_sums > 0.1)
if(length(problematic_samples) > 0){
warning(paste("omitted samples with > 10% missing entries:",
paste(colnames(filtered_score_matrix)[problematic_samples], collapse = ","), sep= " "))
filtered_score_matrix <- filtered_score_matrix[,-problematic_samples]
cat("% of missing values per sample:")
print(score_matrix_col_sums)
}
#imputing missing values
#message("imputing missing values with k nearest neighbor (k = 3)")
#filtered_score_matrix <- impute.knn(as.matrix(filtered_score_matrix), k = 3)
return(filtered_score_matrix)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment