Skip to content

Instantly share code, notes, and snippets.

@IsmailM
Last active July 15, 2018 18:36
Show Gist options
  • Select an option

  • Save IsmailM/f9316c872a35cb034f1974aa045ca23c to your computer and use it in GitHub Desktop.

Select an option

Save IsmailM/f9316c872a35cb034f1974aa045ca23c to your computer and use it in GitHub Desktop.
analysing WGBS data in R
# install.packages('data.table')
# install.packages('ggplot2')
# install.packages('GenomicRanges')
library('data.table')
library('ggplot2')
library(GenomicRanges)
# OUTPUT from GEMBS
read_filtered_cpg_file <- function(file, project_title) {
header <- c('contig', 'position', 'reference.genotype', 'called.genotype', 'genotype.score',
'methylation.value', 'methylation.sd', 'non.converted', 'converted',
'gt.reads', 'total.reads', 'base.counts.pos.1', 'base.counts.pos.2')
d <- fread(paste0('gunzip -c ', file), sep="\t", col.names = header)
d[, type := factor(project_title) ]
# remove '-' in the methylation.value & methylation.sd
d[methylation.value == '-', methylation.value := NA ]
d[methylation.sd == '-', methylation.sd := NA ]
d[, methylation.value := as.numeric(methylation.value) ]
d[, methylation.sd := as.numeric(methylation.sd) ]
return(d)
}
files_sub02_5 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub02_5_cpg.txt.gz"
files_sub05 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub05_cpg.txt.gz"
files_sub10 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub10_cpg.txt.gz"
files_sub25 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub25_cpg.txt.gz"
files_sub50 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub50_cpg.txt.gz"
files_orig_100 <- "filtered_meths/SAMPLE_EGAZ00001016575_cpg.txt.gz"
files <- c(files_sub02_5,files_sub05,files_sub10,files_sub25,files_sub50,files_orig_100)
d02_5 <- read_filtered_cpg_file(files_sub02_5, 2.5)
setkey(d02_5, contig, position)
d05 <- read_filtered_cpg_file(files_sub05, 5)
setkey(d05, contig, position)
d10 <- read_filtered_cpg_file(files_sub10, 10)
setkey(d10, contig, position)
d25 <- read_filtered_cpg_file(files_sub25, 25)
setkey(d25, contig, position)
d50 <- read_filtered_cpg_file(files_sub50, 50)
setkey(d50, contig, position)
dorig <- read_filtered_cpg_file(files_orig_100, 100)
setkey(dorig, contig, position)
# sapply(files, function(f) {
# d <- read_filtered_cpg_file(f, '02_5')
# return(list(
# all=d[ !is.na(methylation.value), .N],
# all_5=d[ !is.na(methylation.value) & total.reads > 5 , .N],
# all_10=d[ !is.na(methylation.value) & total.reads > 10 , .N]
# ))
# })
da <- rbindlist(list(d02_5,d05,d10,d25,d50,dorig))
setkey(da, type, contig, position)
# da[!is.na(methylation.value), .N, by = .(type)]
# type N
# 1: 02_5 22649458
# 2: 05 26211579
# 3: 10 28166584
# 4: 25 28915790
# 5: 50 28999659
# 6: 100 29050560
############ subset to just chr22
# ca22 <- da[contig == 'chr22']
# ca22_s <- ca22[, .(contig, position, methylation.value, type)]
# setkey(ca22_s, type, contig, position)
# methylation_table <- dcast(ca22_s, contig + position ~ type, value.var="methylation.value")
# meth_table <- na.omit(methylation_table)
# in_cols = c('c2.5','c5','c10','c25','c50','c100')
# out_cols = c('diff_c2.5','diff_c5','diff_c10','diff_c25','diff_c50','diff_c100')
# meth_table[, c(out_cols) := c100 - .SD, .SDcols = in_cols]
# names(meth_table)
# do.call(setnames, list(meth_table, c('c2.5','c5','c10','c25','c50','c100'), c('meth_c2.5','meth_c5','meth_c10','meth_c25','meth_c50','meth_c100')))
# colA <- c('meth_c2.5','meth_c5','meth_c10','meth_c25','meth_c50','meth_c100')
# colB <- c('diff_c2.5','diff_c5','diff_c10','diff_c25','diff_c50','diff_c100')
# meth_table[, ]
# meth_table[, ggplot() + geom_histogram(aes(diff_c50),alpha = 1, binwidth=.05, col="red", position="stack") ]
# m <- melt.data.table(meth_table, id.var=c("contig","position"), variable.name = "type")
# diff_m <- m[type %in% colB ]
# ggplot(diff_m) + geom_histogram(aes(value), alpha = 0.1, binwidth=.05, col="red") + xlab('Absolute Difference from Original Methylation Value') + facet_grid( type ~ . )
# ggplot(diff_m) + geom_boxplot(aes(type, value, colour=type)) + xlab("Coverage") + ylab('Absolute Difference from Original Methylation Value')
# sm <- m[type %in% colA ]
####
da_s <- da[, .(contig, position, methylation.value, type)]
setkey(da_s, contig, position)
ggplot(da_s) + geom_density(aes(methylation.value), fill='steelblue', alpha=0.7 ) + facet_grid(type ~ .)
dorig_s <- dorig[, .(contig, position, methylation.value)]
setkey(dorig_s, contig, position)
rs <- da_s[sample(.N,100)]
setkey(rs, contig, position)
da_s1 <- rs[dorig_s, nomatch=0]
str(rs)
str(da_s)
da_s1 <- da_s[dorig_s, nomatch=0]
da_s1[, diff := (methylation.value - i.methylation.value)]
<- da_s1[sample(.N,1000)]
ggplot(da_s1) + geom_histogram(aes(diff), fill='steelblue', alpha=0.7 ) + facet_grid(type ~ .)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment