Last active
July 15, 2018 18:36
-
-
Save IsmailM/f9316c872a35cb034f1974aa045ca23c to your computer and use it in GitHub Desktop.
analysing WGBS data in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # install.packages('data.table') | |
| # install.packages('ggplot2') | |
| # install.packages('GenomicRanges') | |
| library('data.table') | |
| library('ggplot2') | |
| library(GenomicRanges) | |
| # OUTPUT from GEMBS | |
| read_filtered_cpg_file <- function(file, project_title) { | |
| header <- c('contig', 'position', 'reference.genotype', 'called.genotype', 'genotype.score', | |
| 'methylation.value', 'methylation.sd', 'non.converted', 'converted', | |
| 'gt.reads', 'total.reads', 'base.counts.pos.1', 'base.counts.pos.2') | |
| d <- fread(paste0('gunzip -c ', file), sep="\t", col.names = header) | |
| d[, type := factor(project_title) ] | |
| # remove '-' in the methylation.value & methylation.sd | |
| d[methylation.value == '-', methylation.value := NA ] | |
| d[methylation.sd == '-', methylation.sd := NA ] | |
| d[, methylation.value := as.numeric(methylation.value) ] | |
| d[, methylation.sd := as.numeric(methylation.sd) ] | |
| return(d) | |
| } | |
| files_sub02_5 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub02_5_cpg.txt.gz" | |
| files_sub05 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub05_cpg.txt.gz" | |
| files_sub10 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub10_cpg.txt.gz" | |
| files_sub25 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub25_cpg.txt.gz" | |
| files_sub50 <- "filtered_meths/SAMPLE_EGAZ00001016575_sub50_cpg.txt.gz" | |
| files_orig_100 <- "filtered_meths/SAMPLE_EGAZ00001016575_cpg.txt.gz" | |
| files <- c(files_sub02_5,files_sub05,files_sub10,files_sub25,files_sub50,files_orig_100) | |
| d02_5 <- read_filtered_cpg_file(files_sub02_5, 2.5) | |
| setkey(d02_5, contig, position) | |
| d05 <- read_filtered_cpg_file(files_sub05, 5) | |
| setkey(d05, contig, position) | |
| d10 <- read_filtered_cpg_file(files_sub10, 10) | |
| setkey(d10, contig, position) | |
| d25 <- read_filtered_cpg_file(files_sub25, 25) | |
| setkey(d25, contig, position) | |
| d50 <- read_filtered_cpg_file(files_sub50, 50) | |
| setkey(d50, contig, position) | |
| dorig <- read_filtered_cpg_file(files_orig_100, 100) | |
| setkey(dorig, contig, position) | |
| # sapply(files, function(f) { | |
| # d <- read_filtered_cpg_file(f, '02_5') | |
| # return(list( | |
| # all=d[ !is.na(methylation.value), .N], | |
| # all_5=d[ !is.na(methylation.value) & total.reads > 5 , .N], | |
| # all_10=d[ !is.na(methylation.value) & total.reads > 10 , .N] | |
| # )) | |
| # }) | |
| da <- rbindlist(list(d02_5,d05,d10,d25,d50,dorig)) | |
| setkey(da, type, contig, position) | |
| # da[!is.na(methylation.value), .N, by = .(type)] | |
| # type N | |
| # 1: 02_5 22649458 | |
| # 2: 05 26211579 | |
| # 3: 10 28166584 | |
| # 4: 25 28915790 | |
| # 5: 50 28999659 | |
| # 6: 100 29050560 | |
| ############ subset to just chr22 | |
| # ca22 <- da[contig == 'chr22'] | |
| # ca22_s <- ca22[, .(contig, position, methylation.value, type)] | |
| # setkey(ca22_s, type, contig, position) | |
| # methylation_table <- dcast(ca22_s, contig + position ~ type, value.var="methylation.value") | |
| # meth_table <- na.omit(methylation_table) | |
| # in_cols = c('c2.5','c5','c10','c25','c50','c100') | |
| # out_cols = c('diff_c2.5','diff_c5','diff_c10','diff_c25','diff_c50','diff_c100') | |
| # meth_table[, c(out_cols) := c100 - .SD, .SDcols = in_cols] | |
| # names(meth_table) | |
| # do.call(setnames, list(meth_table, c('c2.5','c5','c10','c25','c50','c100'), c('meth_c2.5','meth_c5','meth_c10','meth_c25','meth_c50','meth_c100'))) | |
| # colA <- c('meth_c2.5','meth_c5','meth_c10','meth_c25','meth_c50','meth_c100') | |
| # colB <- c('diff_c2.5','diff_c5','diff_c10','diff_c25','diff_c50','diff_c100') | |
| # meth_table[, ] | |
| # meth_table[, ggplot() + geom_histogram(aes(diff_c50),alpha = 1, binwidth=.05, col="red", position="stack") ] | |
| # m <- melt.data.table(meth_table, id.var=c("contig","position"), variable.name = "type") | |
| # diff_m <- m[type %in% colB ] | |
| # ggplot(diff_m) + geom_histogram(aes(value), alpha = 0.1, binwidth=.05, col="red") + xlab('Absolute Difference from Original Methylation Value') + facet_grid( type ~ . ) | |
| # ggplot(diff_m) + geom_boxplot(aes(type, value, colour=type)) + xlab("Coverage") + ylab('Absolute Difference from Original Methylation Value') | |
| # sm <- m[type %in% colA ] | |
| #### | |
| da_s <- da[, .(contig, position, methylation.value, type)] | |
| setkey(da_s, contig, position) | |
| ggplot(da_s) + geom_density(aes(methylation.value), fill='steelblue', alpha=0.7 ) + facet_grid(type ~ .) | |
| dorig_s <- dorig[, .(contig, position, methylation.value)] | |
| setkey(dorig_s, contig, position) | |
| rs <- da_s[sample(.N,100)] | |
| setkey(rs, contig, position) | |
| da_s1 <- rs[dorig_s, nomatch=0] | |
| str(rs) | |
| str(da_s) | |
| da_s1 <- da_s[dorig_s, nomatch=0] | |
| da_s1[, diff := (methylation.value - i.methylation.value)] | |
| <- da_s1[sample(.N,1000)] | |
| ggplot(da_s1) + geom_histogram(aes(diff), fill='steelblue', alpha=0.7 ) + facet_grid(type ~ .) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment