Last active
September 14, 2021 22:34
-
-
Save moldach/17e8f8b686c8399b96cba1a779733abe to your computer and use it in GitHub Desktop.
Troubleshooting QC for scRNAseq_clustering_comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Import and QC of Koh data set (SRP073808)" | |
output: html_document | |
editor_options: | |
chunk_output_type: console | |
--- | |
```{r load-packages} | |
suppressPackageStartupMessages({ | |
library(MultiAssayExperiment) | |
library(SingleCellExperiment) | |
library(scater) | |
library(scran) | |
library(dplyr) | |
library(ggplot2) | |
}) | |
``` | |
## Load MultiAssayExperiment object | |
```{r load-dataset} | |
if (file.exists("GSE60749-GPL13112.rds")){ | |
maex <- readRDS("GSE60749-GPL13112.rds") | |
} else { | |
download.file("http://imlspenticton.uzh.ch/robinson_lab/conquer/data-mae/GSE60749-GPL13112.rds", "GSE60749-GPL13112.rds") | |
maex <- readRDS("GSE60749-GPL13112.rds") | |
} | |
## Extract the gene-level length-scaled TPMs | |
cts <- assays(experiments(maex)[["gene"]])[["count_lstpm"]] | |
## Extract the phenotype data. | |
phn <- colData(maex) | |
phn$phenoid <- as.character(interaction(as.data.frame(phn[, c("source_name_ch1", | |
"characteristics_ch1.1")]))) | |
## Simplify labels | |
phn$phenoid <- plyr::revalue( | |
phn$phenoid, | |
c("Dgcr8 knockout mouse embryonic stem cells.culture conditions: serum+LIF" = "Dgcr8 knockout mouse serum+LIF", | |
"v6.5 mouse embryonic stem cells.culture conditions: 2i+LIF" = "v6.5 mouse 2i+LIF", | |
"v6.5 mouse embryonic stem cells.culture conditions: serum+LIF" = "v6.5 mouse serum+LIF") | |
) | |
table(phn$phenoid) | |
``` | |
## Create a SingleCellExperiment object | |
```{r create-sce} | |
stopifnot(all(colnames(cts) == rownames(phn))) | |
sce <- SingleCellExperiment( | |
assays = list(counts = cts), | |
colData = phn | |
) | |
## below doesn't work.... | |
# sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, return_norm_as_exprs = TRUE) ## generates logcounts(sce) | |
sce <- normalize(sce, exprs_values = "counts", return_log = TRUE) # is this correct? Get warning message here that its using lib sizes as size factors | |
``` | |
Exclude features that are not expressed | |
```{r reduce-expression-matrix} | |
keep_features <- rowSums(counts(sce) > 0) > 0 | |
table(keep_features) | |
sce <- sce[keep_features, ] | |
dim(sce) | |
``` | |
## Identify the remaining ERCC spike-ins. | |
```{r ercc-mt} | |
is.spike <- grepl("^ERCC", rownames(sce)) | |
table(is.spike) | |
summary(colSums(counts(sce[is.spike, ]))) | |
isSpike(sce, "ERCC") <- is.spike | |
``` | |
## Calculate QC metrics | |
```{r QC} | |
sce <- calculateQCMetrics(sce, feature_controls = list(ERCC = grepl("^ERCC", rownames(sce)))) | |
``` | |
## Quality control using PCA on column data | |
We create a PCA plot based the quality metrics for each cell, e.g., the total | |
number of reads, the total number of features and the proportion of spike-in | |
reads. | |
```{r qc-pca} | |
# get an error from below code: | |
# Error in .local(x, ...) : unused argument (pca_data_input = "coldata") | |
# sce <- scater::runPCA(sce, pca_data_input = "coldata") | |
sce <- scater::runPCA(sce) # is this acceptable?? | |
scater::plotPCA(sce, colour_by = "phenoid") | |
``` | |
## Filter cells | |
We remove cells with log-library sizes (or total features) that are more than 3 | |
median absolute deviations (MADs) below the median log-library size (or total | |
features). | |
```{r histogram} | |
colData(sce)$libsize.drop <- isOutlier(sce$total_counts, nmads = 3, type = "lower", log = TRUE) | |
ggplot(as.data.frame(colData(sce)), aes(x = total_counts)) + | |
geom_histogram(bins = 20, fill = "grey80") + xlab("Total count") + | |
ylab("Number of cells") + | |
geom_vline(xintercept = min(sce$total_counts[!sce$libsize.drop]), | |
color = "red", linetype = "dashed") + | |
theme_bw() | |
# below throws an error: | |
# Error in log10(metric) : non-numeric argument to mathematical function | |
colData(sce)$feature.drop <- isOutlier(sce$total_features, nmads = 3, type = "lower", log = TRUE) | |
# get an error here too: | |
# Error in !sce$feature.drop : invalid argument type | |
ggplot(as.data.frame(colData(sce)), aes(x = total_features)) + | |
geom_histogram(bins = 20, fill = "grey80") + xlab("Number of detected features") + | |
ylab("Number of cells") + | |
geom_vline(xintercept = min(sce$total_features[!sce$feature.drop]), | |
color = "red", linetype = "dashed") + | |
theme_bw() | |
# get an error here also: | |
# Error in base::table(...) : all arguments must have the same length | |
table(libsize = sce$libsize.drop, feature = sce$feature.drop) | |
``` | |
We also filter out cells with a large fraction of ERCC reads. | |
```{r filter-ercc} | |
colData(sce)$spike.drop <- isOutlier(sce$pct_counts_ERCC, nmads = 3, type = "higher") | |
ggplot(as.data.frame(colData(sce)), aes(x = pct_counts_ERCC)) + | |
geom_histogram(bins = 20, fill = "grey80") + xlab("ERCC proportion (%)") + | |
ylab("Number of cells") + | |
geom_vline(xintercept = max(sce$pct_counts_ERCC[!sce$spike.drop]), | |
color = "red", linetype = "dashed") + | |
theme_bw() | |
table(sce$spike.drop) | |
sce <- sce[, !(sce$libsize.drop | sce$feature.drop | sce$spike.drop)] | |
dim(sce) | |
``` | |
## Quality control using highest expressed genes | |
```{r qc-filt} | |
plotQC(sce, type = "highest-expression", n = 50) | |
``` | |
## Data normalization | |
```{r sizefactors} | |
# get ann error from computeSumFactors | |
# Error in .computeSumFactors(assay(x, i = assay.type), subset.row = subset.row, : zero cells in one of the clusters | |
sce <- computeSumFactors(sce, sizes = pmin(ncol(sce), seq(20, 120, 20)), min.mean = 0.1) | |
summary(sizeFactors(sce)) | |
sce <- computeSpikeFactors(sce, general.use = FALSE) | |
``` | |
```{r normalization} | |
# same thing here with normalise/normalize... get an error and the "return_norm_as_exprs" param doesn't exist | |
sce <- normalise(sce, exprs_values = "counts", return_log = TRUE, | |
return_norm_as_exprs = TRUE) | |
sce <- normalise(sce, exprs_values = "counts", return_log = FALSE, | |
return_norm_as_exprs = FALSE) | |
``` | |
## Plot the proportion of explained variances | |
```{r explained-variance, warning = FALSE} | |
expl_vars <- c("phenoid", "log10_total_counts", "log10_total_features", "pct_dropout", | |
"pct_counts_top_200_features", "log10_counts_feature_controls", | |
"pct_counts_feature_controls") | |
plotQC(sce, type = "explanatory-variables", variables = expl_vars) | |
``` | |
## Plot t-SNE representations | |
```{r tSNE} | |
set.seed(1234) | |
sce <- runTSNE(sce, exprs_values = "logcounts", perplexity = 10) | |
plotTSNE(sce, colour_by = "phenoid") | |
plotTSNE(sce, colour_by = "total_features", size_by = "total_counts") | |
``` | |
## Save the normalized and cell filtered dataset | |
```{r save-data} | |
sce <- sce[!isSpike(sce, "ERCC"), ] | |
dim(sce) | |
table(sce$phenoid) | |
saveRDS(sce, file = "../../data/sce_full/sce_full_Kumar.rds") | |
``` | |
## Session info | |
```{r} | |
date() | |
sessionInfo() | |
``` | |
I am attesting that this GitHub handle moldach is linked to the Tezos account tz1epnSBiaR334dL3wuX2TN9S52EJMGeSkXn for tzprofiles | |
sig:edsigtmnQtArWu3h1TuFqrh6g6bRJSM45w2Pxd4CXyExqx6Yaf4fLVqmnBSTa3YGkKiaWt7rBxUhVmz4vJphpzcCueq8ptmfPAk |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment