Created
September 10, 2021 12:26
-
-
Save PoisonAlien/19698548b50a2f9d32b0c8cfc9be9da6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################################################### | |
# | |
# Best-practice 450k/EPIC QC and preprocessing workflow for the PPCG project | |
# | |
# creator: Pavlo Lutsik | |
# | |
# 30.01.2021 | |
#################################################################################### | |
library(RnBeads) | |
library(grid) | |
library(wateRmelon) | |
libra | |
#devtools::load_all("~/RnBeads_mmbc/", reset=FALSE) | |
rnb.options(disk.dump.big.matrices=FALSE) | |
rnb.options(region.types=character(0)) | |
# 0. Configuration (on b370-worker01) | |
DATA_DIR<-"../01-raw_data/GDCdata/TCGA-ACC/" | |
IDAT_DIR<-file.path(DATA_DIR) | |
#SAMPLE_SHEET<-file.path(DATA_DIR, "Annotation", "PPCA_Sample_annotation_an_PL.txt") | |
SAMPLE_SHEET<-file.path(DATA_DIR, "sample_table.csv") | |
#ID_COL="Meth_Local_Sample_ID" | |
ID_COL="Tumor_Sample_Barcode" | |
#OUT_DIR<-"/local3/PPCG/analysis/rnbeads/" | |
OUT_DIR<-"ACC" | |
dir.create(OUT_DIR, showWarnings = FALSE) | |
################################################### 1. Loading | |
sample_sheet<-read.csv(SAMPLE_SHEET, sep=",", as.is=TRUE) | |
rnb.options(identifiers.column=ID_COL) | |
#sample_sheet$barcode<-sample_sheet$Barcode | |
#sample_sheet$barcode<-gsub("_Grn|_Grn.idat|_Grb.idat|_Grn.idat.gz", "", sample_sheet$idat_grn_filename) | |
#sample_sheet<-sample_sheet[sample_sheet$Platform %in% c("450K", "450k"),] | |
#sample_sheet<-sample_sheet[sample_sheet[[ID_COL]] == "PPCG0388m_450k",] | |
#quick workaround | |
#sample_sheet<-sample_sheet[-which(duplicated(sample_sheet[[ID_COL]])),] | |
rnb.set.raw<-rnb.execute.import(data.source=list(IDAT_DIR, sample_sheet)) | |
## save the dataset | |
#save.rnb.set(rnb.set.raw, path=file.path(DATA_DIR, "PPCG_rnb_set_raw")) | |
#################################################### 2. QC | |
### control boxplots | |
ctypes<-rnb.infinium.control.targets(rnb.set.raw@target) | |
plots<-lapply(ctypes, function (ct) rnb.plot.control.boxplot(rnb.set.raw, ct)) | |
grb<-arrangeGrob(grobs=plots, respect=FALSE) | |
pdf(file.path(OUT_DIR, "qc_boxplots.pdf"), width=20, height=20) | |
grid.draw(grb) | |
dev.off() | |
### SNP heatmap | |
pdf(file.path(OUT_DIR, "snp_heatmap.pdf"), width=7, height=7) | |
rnb.plot.snp.heatmap(rnb.set.raw) | |
dev.off() | |
## experiment with the QC data | |
qc_data<-qc(rnb.set.raw) | |
qc_anno<-rnb.get.annotation(gsub("probes", "controls", rnb.set.raw@target)) | |
colnames(qc_data[["Cy5"]])<-paste0(samples(rnb.set.raw), "_Grn") | |
colnames(qc_data[["Cy3"]])<-paste0(samples(rnb.set.raw), "_Red") | |
qc_data<-cbind(qc_anno, qc_data[["Cy5"]], qc_data[["Cy3"]]) | |
#### | |
qc_df<-data.frame(Sample=samples(rnb.set.raw), Group= OUT_DIR, Cohort = OUT_DIR) | |
## number of bad detection p-values | |
DPVAL_MAX<-0.05 | |
dp<-dpval(rnb.set.raw) | |
qc_df$High_dpval_count<-colSums(dp>DPVAL_MAX) | |
p<-ggplot(qc_df, aes(x=Group, y=High_dpval_count)) + geom_violin() + geom_jitter(aes(color=Cohort, shape=Cohort)) | |
p<-p+scale_shape_manual(values=5:13) | |
p<-p + ggtitle(paste("Number of detection P-values >", DPVAL_MAX)) | |
pdf(file.path(OUT_DIR, "dpval_violin.pdf"), width=7, height=7) | |
print(p) | |
dev.off() | |
p<-p+scale_y_log10() | |
pdf(file.path(OUT_DIR, "dpval_violin_log10.pdf"), width=7, height=7) | |
print(p) | |
dev.off() | |
## median bead count distribution | |
BEAD_NUMBER_MIN<-3 | |
bc<-covg(rnb.set.raw) | |
qc_df$Median_bead_counts<-colMedians(bc) | |
p<-ggplot(qc_df, aes(x=Group, y=Median_bead_counts)) + geom_violin() + geom_jitter(aes(color=Cohort, shape=Cohort)) | |
p<-p+scale_shape_manual(values=5:13) | |
p<-p + ggtitle(paste("Median bead count")) | |
pdf(file.path(OUT_DIR, "bc_median.pdf"), width=7, height=7) | |
print(p) | |
dev.off() | |
qc_df$Low_bead_counts<-colSums(bc<BEAD_NUMBER_MIN) | |
p<-ggplot(qc_df, aes(x=Group, y=Low_bead_counts)) + geom_violin() + geom_jitter(aes(color=Cohort, shape=Cohort)) | |
p<-p+scale_shape_manual(values=5:13) | |
p<-p + ggtitle(paste("Number of samples with bead count<",BEAD_NUMBER_MIN)) | |
pdf(file.path(OUT_DIR, "bc_low.pdf"), width=7, height=7) | |
print(p) | |
dev.off() | |
###### SNP distances | |
mm<-meth(rnb.set.raw, row.names=TRUE) | |
mm.snp<-mm[grep("rs" ,rownames(mm)),] | |
dd.snp<-as.matrix(dist(t(mm.snp))) | |
dist.self<-diag(dd.snp) | |
snp_dist_df<-data.frame(Group="PPCG", Value=as.numeric(dd.snp[lower.tri(dd.snp)])) | |
p<-ggplot(snp_dist_df, aes(x=Group, y=Value)) + geom_violin() + geom_jitter(height=0)# aes(color=Cohort, shape=Cohort)) | |
pdf(file.path(OUT_DIR, "snp_dist_distribution.pdf"), width=7, height=7) | |
print(p) | |
dev.off() | |
#################################################### 3. Normalization | |
NORM_METHOD<-"scaling" | |
#BG_METHOD<-"enmix.oob" | |
BG_METHOD<-"subtraction" | |
### workflow 107, the best from the internal benchmarking study | |
### pOOBAH filtering + methylumi NOOB bg subtraction + watermelon nanes + BMIQ | |
NORM_METHOD<-"wm.nanes+bmiq" | |
BG_METHOD<-"methylumi.noob" | |
rnb.set.norm<-rnb.execute.normalization(object = rnb.set.raw, method = NORM_METHOD, bgcorr.method = BG_METHOD, verbose = TRUE) | |
save.rnb.set(rnb.set.norm, path=file.path(DATA_DIR, "PPCG_rnb_set_raw")) | |
#################################################### 4. Filtering | |
#### PARAMETERS | |
BEAD_NUMBER_MIN<-3L | |
DPVAL_MAX<-0.05 | |
SNP_FILTER<-c("no", "3", "5", "any", "yes")[3L] | |
NON_CG_CONTEXT<-c("CC","CAG","CAH","CTG","CTH","Other") | |
NA_QUANTILE_PROBES<-0.01 #threshold: Maximum quantile of 'NA's allowed per site. This must be a value between 0 and 1. | |
NA_QUANTILE_SAMPLES<-0.002 | |
MAX_LOW_NBEADS<-0.01 | |
MAX_HIGH_DPVAL<-0.01 | |
################## | |
## 4.1 Greedycut | |
#Executes the Greedycut procedure for probe and sample filtering | |
#based on the detection p-values, and calculates statistics on its | |
#iterations. | |
#DPVAL_MAX<-0.05 | |
#BEAD_NUMBER_MIN<-3L | |
#rnb.options(filtering.greedycut.pvalue.threshold=DPVAL_MAX) | |
#rnb.options(filtering.coverage.threshold=BEAD_NUMBER_MIN) | |
if(FALSE){ | |
greedycut_result<-rnb.execute.greedycut(rnb.set.norm, pval.threshold=1, min.coverage=BEAD_NUMBER_MIN) | |
rem<-length(greedycut_result$sites) | |
qc_df$Greedycut_removed<-rem | |
all<-nrow(rnb.set.norm@sites) | |
cat(sprintf("[Greedycut] removed %d of %d probes\n", rem, all, 100*rem/all)) | |
rnb.set<-remove.sites(rnb.set.norm, greedycut_result$sites) | |
if(FALSE){ | |
rnb.set<-remove.samples(rnb.set, greedycut_result$samples) | |
} | |
} | |
## 4.1.1 POOBAH | |
## p-value with Out-Of-Band Array Hybridization | |
rnb.set.mod<-rnb.execute.pOOBAH(rnb.set, pval.thresh=DPVAL_MAX, verbose=TRUE) | |
na.orig<-is.na(meth(rnb.set)) | |
na.poobah<-is.na(meth(rnb.set.mod)) | |
na.poobah[na.orig]<-FALSE | |
qc_df$pOOBAH_masked_count<-colSums(na.poobah) | |
#rnb.set<-rnb.set.mod | |
## 4.2 Mask remaining low detection and low bead number | |
dpval_mask_result<-rnb.execute.high.dpval.masking(rnb.set.mod, dpval.threshold=DPVAL_MAX) | |
dpval_mask_result$dataset.before<-NULL | |
rnb.set<-dpval_mask_result$dataset | |
dpval_mask_result$dataset<-NULL | |
qc_df$High_dpval_masked_count<-colSums(dpval_mask_result$mask) | |
rem<-sum(qc_df$High_dpval_masked_count) | |
all<-prod(dim(meth(rnb.set))) | |
cat(sprintf("[High detection p-value masking] masked %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all)) | |
###### | |
covg_mask_result<-rnb.execute.low.coverage.masking(rnb.set, covg.threshold=BEAD_NUMBER_MIN) | |
covg_mask_result$dataset.before<-NULL | |
rnb.set<-covg_mask_result$dataset | |
covg_mask_result$dataset<-NULL | |
qc_df$Low_coverage_masked_count<-colSums(covg_mask_result$mask) | |
rem<-sum(qc_df$Low_coverage_masked_count) | |
all<-prod(dim(meth(rnb.set))) | |
cat(sprintf("[Low-coverage masking] masked %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all)) | |
## 4.3 Intensity artefacts | |
#### no specialized function yet | |
## 4.4 Cross-reactive probes | |
cross_reactive_result<-rnb.execute.cross.reactive.removal(rnb.set) | |
all<-nrow(rnb.set@sites) | |
cross_reactive_result$dataset.before<-NULL | |
rnb.set<-cross_reactive_result$dataset | |
cross_reactive_result$dataset<-NULL | |
rem<-length(cross_reactive_result$filtered) | |
qc_df$Removed_cross_reactive<-rem | |
cat(sprintf("[Removing cross-reactive probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all)) | |
## 4.5. SNP-overlapping probes | |
### TODO: use MethylToSNP | |
#SNP_FILTER<-c("no", "3", "5", "any", "yes")[3L] | |
#*'filtering.snp'*' = "3"' Removal of sites or probes based on | |
#overlap with SNPs. The accepted values for this option are: | |
#"no" no SNP-based filtering; | |
#"3" filter out a probe when the last 3 bases in its target | |
# sequence overlap with SNP; | |
#"5" filter out a probe when the last 5 bases in its target | |
# sequence overlap with SNP; | |
#"any" or "yes" filter out a CpG site or probe when any | |
# base in its target sequence overlaps with SNP. | |
#Bisulfite sequencing datasets operate on sites instead of | |
#probes, therefore, the values '"3"' and '"5"' are treated as | |
#"yes" | |
snp_overlapping_result<-rnb.execute.snp.removal(rnb.set, snp=SNP_FILTER) | |
all<-nrow(rnb.set@sites) | |
snp_overlapping_result$dataset.before<-NULL | |
rnb.set<-snp_overlapping_result$dataset | |
snp_overlapping_result$dataset<-NULL | |
rem<-length(snp_overlapping_result$filtered) | |
qc_df$Removed_SNP_overlapping<-rem | |
cat(sprintf("[Removing cross-reactive probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all)) | |
## 4.6. Non-relevant contexts | |
#NON_CG_CONTEXT<-c("CC","CAG","CAH","CTG","CTH","Other") | |
context_result<-rnb.execute.context.removal(rnb.set, contexts=NON_CG_CONTEXT) | |
all<-nrow(rnb.set@sites) | |
context_result$dataset.before<-NULL | |
rnb.set<-context_result$dataset | |
context_result$dataset<-NULL | |
rem<-length(context_result$filtered) | |
qc_df$Removed_irrelevant_context<-rem | |
cat(sprintf("[Removing non-CpG probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all)) | |
## 4.7 Non-variable sites | |
if(FALSE){ | |
MIN_SD<-0.05 | |
variability_result<-rnb.execute.variability.removal(rnb.set, min.deviation=MIN_SD) | |
variability_result$dataset.before<-NULL | |
rnb.set<-variability_result$dataset | |
variability_result$dataset<-NULL | |
qc_df$Removed_non_variable<-length(variability_result$filtered) | |
} | |
## 4.8 Missing value removal | |
#NA_QUANTILE_PROBES<-0.01 | |
#threshold: Maximum quantile of 'NA's allowed per site. This must be a | |
# value between 0 and 1. | |
na_result<-rnb.execute.na.removal(rnb.set, threshold=NA_QUANTILE_PROBES) | |
all<-nrow(rnb.set@sites) | |
na_result$dataset.before<-NULL | |
rnb.set<-na_result$dataset | |
na_result$dataset<-NULL | |
rem<-length(na_result$filtered) | |
qc_df$Removed_missing_values<-rem | |
cat(sprintf("[Removing NA-value probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all)) | |
## 4.9 Sites on sex chromosomes --- irrelevant | |
#rnb.set<-rnb.execute.sex.removal(rnb.set) | |
## 4.10 Removing low-quality samples | |
NA_QUANTILE_SAMPLES<-0.002 | |
na.perc<-colSums(is.na(meth(rnb.set)))/nrow(rnb.set@sites) | |
qc_df$Fraction_missing_values<-na.perc | |
pdf(file.path(OUT_DIR, "NA_count_distribution.pdf"), width=7, height=7) | |
hist(na.perc, breaks=100) | |
dev.off() | |
pdf(file.path(OUT_DIR, "NA_count_distribution_zoom.pdf"), width=7, height=7) | |
hist(na.perc, breaks=1000,xlim=c(0, 0.02)) | |
dev.off() | |
low_quality_samples<-which(na.perc>NA_QUANTILE_SAMPLES) | |
qc_df$Low_quality<-"no" | |
qc_df$Low_quality[low_quality_samples]<-"yes" | |
cat(sprintf("[Low-quality samples] identified %d potentially low-quality samples", length(low_quality_samples), 100*length(low_quality_samples)/length(samples(rnb.set)))) | |
write.table(qc_df, file=file.path(OUT_DIR, "QC_summary_table.txt"), sep="\t", quote=FALSE) | |
#################################################### 5. Postprocessing QC and validation | |
#################################################### 6. Export | |
saveRDS(rnb.set, file=file.path(OUT_DIR, "rnb.set_filtered_normalized.RDS")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment