Skip to content

Instantly share code, notes, and snippets.

@PoisonAlien
Created September 10, 2021 12:26
Show Gist options
  • Save PoisonAlien/19698548b50a2f9d32b0c8cfc9be9da6 to your computer and use it in GitHub Desktop.
Save PoisonAlien/19698548b50a2f9d32b0c8cfc9be9da6 to your computer and use it in GitHub Desktop.
####################################################################################
#
# Best-practice 450k/EPIC QC and preprocessing workflow for the PPCG project
#
# creator: Pavlo Lutsik
#
# 30.01.2021
####################################################################################
library(RnBeads)
library(grid)
library(wateRmelon)
libra
#devtools::load_all("~/RnBeads_mmbc/", reset=FALSE)
rnb.options(disk.dump.big.matrices=FALSE)
rnb.options(region.types=character(0))
# 0. Configuration (on b370-worker01)
DATA_DIR<-"../01-raw_data/GDCdata/TCGA-ACC/"
IDAT_DIR<-file.path(DATA_DIR)
#SAMPLE_SHEET<-file.path(DATA_DIR, "Annotation", "PPCA_Sample_annotation_an_PL.txt")
SAMPLE_SHEET<-file.path(DATA_DIR, "sample_table.csv")
#ID_COL="Meth_Local_Sample_ID"
ID_COL="Tumor_Sample_Barcode"
#OUT_DIR<-"/local3/PPCG/analysis/rnbeads/"
OUT_DIR<-"ACC"
dir.create(OUT_DIR, showWarnings = FALSE)
################################################### 1. Loading
sample_sheet<-read.csv(SAMPLE_SHEET, sep=",", as.is=TRUE)
rnb.options(identifiers.column=ID_COL)
#sample_sheet$barcode<-sample_sheet$Barcode
#sample_sheet$barcode<-gsub("_Grn|_Grn.idat|_Grb.idat|_Grn.idat.gz", "", sample_sheet$idat_grn_filename)
#sample_sheet<-sample_sheet[sample_sheet$Platform %in% c("450K", "450k"),]
#sample_sheet<-sample_sheet[sample_sheet[[ID_COL]] == "PPCG0388m_450k",]
#quick workaround
#sample_sheet<-sample_sheet[-which(duplicated(sample_sheet[[ID_COL]])),]
rnb.set.raw<-rnb.execute.import(data.source=list(IDAT_DIR, sample_sheet))
## save the dataset
#save.rnb.set(rnb.set.raw, path=file.path(DATA_DIR, "PPCG_rnb_set_raw"))
#################################################### 2. QC
### control boxplots
ctypes<-rnb.infinium.control.targets(rnb.set.raw@target)
plots<-lapply(ctypes, function (ct) rnb.plot.control.boxplot(rnb.set.raw, ct))
grb<-arrangeGrob(grobs=plots, respect=FALSE)
pdf(file.path(OUT_DIR, "qc_boxplots.pdf"), width=20, height=20)
grid.draw(grb)
dev.off()
### SNP heatmap
pdf(file.path(OUT_DIR, "snp_heatmap.pdf"), width=7, height=7)
rnb.plot.snp.heatmap(rnb.set.raw)
dev.off()
## experiment with the QC data
qc_data<-qc(rnb.set.raw)
qc_anno<-rnb.get.annotation(gsub("probes", "controls", rnb.set.raw@target))
colnames(qc_data[["Cy5"]])<-paste0(samples(rnb.set.raw), "_Grn")
colnames(qc_data[["Cy3"]])<-paste0(samples(rnb.set.raw), "_Red")
qc_data<-cbind(qc_anno, qc_data[["Cy5"]], qc_data[["Cy3"]])
####
qc_df<-data.frame(Sample=samples(rnb.set.raw), Group= OUT_DIR, Cohort = OUT_DIR)
## number of bad detection p-values
DPVAL_MAX<-0.05
dp<-dpval(rnb.set.raw)
qc_df$High_dpval_count<-colSums(dp>DPVAL_MAX)
p<-ggplot(qc_df, aes(x=Group, y=High_dpval_count)) + geom_violin() + geom_jitter(aes(color=Cohort, shape=Cohort))
p<-p+scale_shape_manual(values=5:13)
p<-p + ggtitle(paste("Number of detection P-values >", DPVAL_MAX))
pdf(file.path(OUT_DIR, "dpval_violin.pdf"), width=7, height=7)
print(p)
dev.off()
p<-p+scale_y_log10()
pdf(file.path(OUT_DIR, "dpval_violin_log10.pdf"), width=7, height=7)
print(p)
dev.off()
## median bead count distribution
BEAD_NUMBER_MIN<-3
bc<-covg(rnb.set.raw)
qc_df$Median_bead_counts<-colMedians(bc)
p<-ggplot(qc_df, aes(x=Group, y=Median_bead_counts)) + geom_violin() + geom_jitter(aes(color=Cohort, shape=Cohort))
p<-p+scale_shape_manual(values=5:13)
p<-p + ggtitle(paste("Median bead count"))
pdf(file.path(OUT_DIR, "bc_median.pdf"), width=7, height=7)
print(p)
dev.off()
qc_df$Low_bead_counts<-colSums(bc<BEAD_NUMBER_MIN)
p<-ggplot(qc_df, aes(x=Group, y=Low_bead_counts)) + geom_violin() + geom_jitter(aes(color=Cohort, shape=Cohort))
p<-p+scale_shape_manual(values=5:13)
p<-p + ggtitle(paste("Number of samples with bead count<",BEAD_NUMBER_MIN))
pdf(file.path(OUT_DIR, "bc_low.pdf"), width=7, height=7)
print(p)
dev.off()
###### SNP distances
mm<-meth(rnb.set.raw, row.names=TRUE)
mm.snp<-mm[grep("rs" ,rownames(mm)),]
dd.snp<-as.matrix(dist(t(mm.snp)))
dist.self<-diag(dd.snp)
snp_dist_df<-data.frame(Group="PPCG", Value=as.numeric(dd.snp[lower.tri(dd.snp)]))
p<-ggplot(snp_dist_df, aes(x=Group, y=Value)) + geom_violin() + geom_jitter(height=0)# aes(color=Cohort, shape=Cohort))
pdf(file.path(OUT_DIR, "snp_dist_distribution.pdf"), width=7, height=7)
print(p)
dev.off()
#################################################### 3. Normalization
NORM_METHOD<-"scaling"
#BG_METHOD<-"enmix.oob"
BG_METHOD<-"subtraction"
### workflow 107, the best from the internal benchmarking study
### pOOBAH filtering + methylumi NOOB bg subtraction + watermelon nanes + BMIQ
NORM_METHOD<-"wm.nanes+bmiq"
BG_METHOD<-"methylumi.noob"
rnb.set.norm<-rnb.execute.normalization(object = rnb.set.raw, method = NORM_METHOD, bgcorr.method = BG_METHOD, verbose = TRUE)
save.rnb.set(rnb.set.norm, path=file.path(DATA_DIR, "PPCG_rnb_set_raw"))
#################################################### 4. Filtering
#### PARAMETERS
BEAD_NUMBER_MIN<-3L
DPVAL_MAX<-0.05
SNP_FILTER<-c("no", "3", "5", "any", "yes")[3L]
NON_CG_CONTEXT<-c("CC","CAG","CAH","CTG","CTH","Other")
NA_QUANTILE_PROBES<-0.01 #threshold: Maximum quantile of 'NA's allowed per site. This must be a value between 0 and 1.
NA_QUANTILE_SAMPLES<-0.002
MAX_LOW_NBEADS<-0.01
MAX_HIGH_DPVAL<-0.01
##################
## 4.1 Greedycut
#Executes the Greedycut procedure for probe and sample filtering
#based on the detection p-values, and calculates statistics on its
#iterations.
#DPVAL_MAX<-0.05
#BEAD_NUMBER_MIN<-3L
#rnb.options(filtering.greedycut.pvalue.threshold=DPVAL_MAX)
#rnb.options(filtering.coverage.threshold=BEAD_NUMBER_MIN)
if(FALSE){
greedycut_result<-rnb.execute.greedycut(rnb.set.norm, pval.threshold=1, min.coverage=BEAD_NUMBER_MIN)
rem<-length(greedycut_result$sites)
qc_df$Greedycut_removed<-rem
all<-nrow(rnb.set.norm@sites)
cat(sprintf("[Greedycut] removed %d of %d probes\n", rem, all, 100*rem/all))
rnb.set<-remove.sites(rnb.set.norm, greedycut_result$sites)
if(FALSE){
rnb.set<-remove.samples(rnb.set, greedycut_result$samples)
}
}
## 4.1.1 POOBAH
## p-value with Out-Of-Band Array Hybridization
rnb.set.mod<-rnb.execute.pOOBAH(rnb.set, pval.thresh=DPVAL_MAX, verbose=TRUE)
na.orig<-is.na(meth(rnb.set))
na.poobah<-is.na(meth(rnb.set.mod))
na.poobah[na.orig]<-FALSE
qc_df$pOOBAH_masked_count<-colSums(na.poobah)
#rnb.set<-rnb.set.mod
## 4.2 Mask remaining low detection and low bead number
dpval_mask_result<-rnb.execute.high.dpval.masking(rnb.set.mod, dpval.threshold=DPVAL_MAX)
dpval_mask_result$dataset.before<-NULL
rnb.set<-dpval_mask_result$dataset
dpval_mask_result$dataset<-NULL
qc_df$High_dpval_masked_count<-colSums(dpval_mask_result$mask)
rem<-sum(qc_df$High_dpval_masked_count)
all<-prod(dim(meth(rnb.set)))
cat(sprintf("[High detection p-value masking] masked %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all))
######
covg_mask_result<-rnb.execute.low.coverage.masking(rnb.set, covg.threshold=BEAD_NUMBER_MIN)
covg_mask_result$dataset.before<-NULL
rnb.set<-covg_mask_result$dataset
covg_mask_result$dataset<-NULL
qc_df$Low_coverage_masked_count<-colSums(covg_mask_result$mask)
rem<-sum(qc_df$Low_coverage_masked_count)
all<-prod(dim(meth(rnb.set)))
cat(sprintf("[Low-coverage masking] masked %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all))
## 4.3 Intensity artefacts
#### no specialized function yet
## 4.4 Cross-reactive probes
cross_reactive_result<-rnb.execute.cross.reactive.removal(rnb.set)
all<-nrow(rnb.set@sites)
cross_reactive_result$dataset.before<-NULL
rnb.set<-cross_reactive_result$dataset
cross_reactive_result$dataset<-NULL
rem<-length(cross_reactive_result$filtered)
qc_df$Removed_cross_reactive<-rem
cat(sprintf("[Removing cross-reactive probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all))
## 4.5. SNP-overlapping probes
### TODO: use MethylToSNP
#SNP_FILTER<-c("no", "3", "5", "any", "yes")[3L]
#*'filtering.snp'*' = "3"' Removal of sites or probes based on
#overlap with SNPs. The accepted values for this option are:
#"no" no SNP-based filtering;
#"3" filter out a probe when the last 3 bases in its target
# sequence overlap with SNP;
#"5" filter out a probe when the last 5 bases in its target
# sequence overlap with SNP;
#"any" or "yes" filter out a CpG site or probe when any
# base in its target sequence overlaps with SNP.
#Bisulfite sequencing datasets operate on sites instead of
#probes, therefore, the values '"3"' and '"5"' are treated as
#"yes"
snp_overlapping_result<-rnb.execute.snp.removal(rnb.set, snp=SNP_FILTER)
all<-nrow(rnb.set@sites)
snp_overlapping_result$dataset.before<-NULL
rnb.set<-snp_overlapping_result$dataset
snp_overlapping_result$dataset<-NULL
rem<-length(snp_overlapping_result$filtered)
qc_df$Removed_SNP_overlapping<-rem
cat(sprintf("[Removing cross-reactive probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all))
## 4.6. Non-relevant contexts
#NON_CG_CONTEXT<-c("CC","CAG","CAH","CTG","CTH","Other")
context_result<-rnb.execute.context.removal(rnb.set, contexts=NON_CG_CONTEXT)
all<-nrow(rnb.set@sites)
context_result$dataset.before<-NULL
rnb.set<-context_result$dataset
context_result$dataset<-NULL
rem<-length(context_result$filtered)
qc_df$Removed_irrelevant_context<-rem
cat(sprintf("[Removing non-CpG probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all))
## 4.7 Non-variable sites
if(FALSE){
MIN_SD<-0.05
variability_result<-rnb.execute.variability.removal(rnb.set, min.deviation=MIN_SD)
variability_result$dataset.before<-NULL
rnb.set<-variability_result$dataset
variability_result$dataset<-NULL
qc_df$Removed_non_variable<-length(variability_result$filtered)
}
## 4.8 Missing value removal
#NA_QUANTILE_PROBES<-0.01
#threshold: Maximum quantile of 'NA's allowed per site. This must be a
# value between 0 and 1.
na_result<-rnb.execute.na.removal(rnb.set, threshold=NA_QUANTILE_PROBES)
all<-nrow(rnb.set@sites)
na_result$dataset.before<-NULL
rnb.set<-na_result$dataset
na_result$dataset<-NULL
rem<-length(na_result$filtered)
qc_df$Removed_missing_values<-rem
cat(sprintf("[Removing NA-value probes] removed %d values out of %d, %1.3f %%\n", rem, all, 100*rem/all))
## 4.9 Sites on sex chromosomes --- irrelevant
#rnb.set<-rnb.execute.sex.removal(rnb.set)
## 4.10 Removing low-quality samples
NA_QUANTILE_SAMPLES<-0.002
na.perc<-colSums(is.na(meth(rnb.set)))/nrow(rnb.set@sites)
qc_df$Fraction_missing_values<-na.perc
pdf(file.path(OUT_DIR, "NA_count_distribution.pdf"), width=7, height=7)
hist(na.perc, breaks=100)
dev.off()
pdf(file.path(OUT_DIR, "NA_count_distribution_zoom.pdf"), width=7, height=7)
hist(na.perc, breaks=1000,xlim=c(0, 0.02))
dev.off()
low_quality_samples<-which(na.perc>NA_QUANTILE_SAMPLES)
qc_df$Low_quality<-"no"
qc_df$Low_quality[low_quality_samples]<-"yes"
cat(sprintf("[Low-quality samples] identified %d potentially low-quality samples", length(low_quality_samples), 100*length(low_quality_samples)/length(samples(rnb.set))))
write.table(qc_df, file=file.path(OUT_DIR, "QC_summary_table.txt"), sep="\t", quote=FALSE)
#################################################### 5. Postprocessing QC and validation
#################################################### 6. Export
saveRDS(rnb.set, file=file.path(OUT_DIR, "rnb.set_filtered_normalized.RDS"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment