-
-
Save Balharbi/9f93a134c83515f1908cb4dfe41402d7 to your computer and use it in GitHub Desktop.
annotate RRBS metilene regions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ChIPpeakAnno) | |
library(biomaRt) | |
library(genomation) | |
library(dplyr) | |
# Read in metilene results | |
res <- read.table("results_de_novo_all_10.txt", sep = "\t", header = T) | |
head(res) | |
# Select DMRs defined as segments with q.value < 0.05 and absolute methylation difference over 10% | |
# Add column threshold that assumes value TRUE in case of DMRS and FALSE for the rest of the segments | |
res <- res[with(res, order(abs(mean_difference), decreasing = T)),] | |
res$threshold <- as.factor(abs(res$mean_difference) > 10 & res$Q.value < 0.05) | |
# Get mart ensembl annotation | |
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org") | |
ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl) | |
# Convert metilene results data frame to GRanges object and annoate with the nearest TSS | |
# using ChIPpeakAnno | |
res_annot <- annotatePeakInBatch(myPeakList = as(res, "GRanges"), | |
mart=ensembl, | |
featureType = "TSS", | |
output = "nearestLocation", | |
PeakLocForDistance = "middle", | |
select = "first", | |
ignore.strand = T) | |
# Add gene ids | |
res_annot <- addGeneIDs(res_annot, mart = ensembl, feature_id_type = "ensembl_gene_id", | |
IDs2Add = c("entrezgene", "hgnc_symbol", "description")) | |
# Read CpG island bed and add CpG island shores | |
cpg.obj <- genomation::readFeatureFlank(location="~/Projects/CRIO_Saliva_RRBS/cpg_island.bed", | |
feature.flank.name=c("CpGi","shores")) | |
# Annotate with CpG islands and shores | |
cgi_ann <- genomation::annotateWithFeatureFlank(res_annot, cpg.obj$CpGi,cpg.obj$shores, | |
feature.name="CpGi",flank.name="Shores") | |
cgi_mat <- genomation::getMembers(cgi_ann) | |
cgi_mat <- as.data.frame(cgi_mat) | |
res_annot <- as.data.frame(res_annot) | |
res_annot <- data.frame(res_annot, cgi_mat) | |
# Select annotated DMRs | |
dmrs_annot <- dplyr::filter(res_annot, threshold == TRUE) | |
dim(dmrs_annot) | |
# Write out annotated results | |
write.table(res_annot, file = "all_regions_annotated_10.txt", sep = "\t", quote = F, col.names = T, row.names = F) | |
write.table(dmrs_annot, file = "DMRs_annotated_10.txt", sep = "\t", quote = F, col.names = T, row.names = F) | |
# Calculate overlaps with common gene parts | |
gene.parts <- genomation::readTranscriptFeatures("hg38.bed") | |
dmrs_annot <- as(dmrs_annot, "GRanges") | |
dmrs_gene_parts <- genomation::annotateWithGeneParts(dmrs_annot, gene.parts) | |
dmrs_gene_parts | |
# Get overlap stats with exons, promoters, introns and intergenic sequences | |
target_annotations <- as.data.frame(getTargetAnnotationStats(dmrs_gene_parts, percentage=TRUE, precedence=TRUE)) | |
names(target_annotations)[1] <- "percent_overlap" | |
target_annotations$genomic_parts <- rownames(target_annotations) | |
target_annotations <- melt(target_annotations, id.vars = "genomic_parts") | |
tiff("Percent_overlap_DMRs_with_features.tiff", width = 500, height = 500) | |
ggplot(target_annotations, aes(x=genomic_parts, y=value, fill=genomic_parts)) + geom_bar(stat = "identity") + | |
xlab("Genomic parts") + | |
ylab("Percent overlapping") + | |
ggtitle("Percent overlap of DMRs with genomic parts\nPrecedence was set as promoter > exon > intron") + | |
theme(axis.text=element_text(size=12), axis.title = element_text(size=14, face = "bold")) + | |
theme(legend.text=element_text(size=10, face = "bold")) + | |
theme(legend.title=element_text(size=12, face = "bold")) | |
dev.off() | |
# Plot histogram of distances to the colosest TSS | |
tiff("histogram_of_distances_to_closest_TSS.tiff", width = 500, height = 500) | |
ggplot(as.data.frame(dmrs_annot), aes(distancetoFeature)) + geom_histogram(colour = "white", fill = "darkblue") + | |
xlab("Distance to TSS") + | |
ylab("Count") + | |
ggtitle("Histogram of distances from DMRs to the closest\ntranscriptiona start sites") + | |
theme(axis.text=element_text(size=12), axis.title = element_text(size=14, face = "bold")) | |
dev.off() | |
hist(dmrs_annot$shortestDistance) | |
sum(dmrs_annot$CpGi) | |
sum(dmrs_annot$Shores) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment