Created
November 25, 2018 18:21
-
-
Save kevinrue/f0f26e4585b075caf6420f5453976d01 to your computer and use it in GitHub Desktop.
Seurat - Guided Clustering Tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted from https://satijalab.org/seurat/pbmc3k_tutorial.html on 2018-11-24 | |
timestamp() | |
message("Started") | |
library(BiocFileCache) | |
library(Seurat) | |
library(dplyr) | |
# Download the data programmatically ---- | |
bfc <- BiocFileCache() | |
pbmc3k_rname <- "pbmc3k_filtered_gene_bc_matrices.tar.gz" | |
if (!pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) { | |
rpath <- bfcadd( | |
x = bfc, | |
rname = "pbmc3k_filtered_gene_bc_matrices.tar.gz", | |
fpath = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz") | |
} else { | |
rpath <- subset(bfcinfo(bfc), pbmc3k_rname == rname, "rpath", drop=TRUE) | |
} | |
tempDir <- tempdir() | |
untar(rpath, exdir = tempDir) | |
# Summarized Seurat tutorial ---- | |
# Load the PBMC dataset | |
pbmc.data <- Read10X(data.dir = file.path(tempDir, "filtered_gene_bc_matrices", "hg19")) | |
# Initialize the Seurat object with the raw (non-normalized data). Keep all | |
# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at | |
# least 200 detected genes | |
pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC") | |
# The number of genes and UMIs (nGene and nUMI) are automatically calculated | |
# for every object by Seurat. For non-UMI data, nUMI represents the sum of | |
# the non-normalized values within a cell We calculate the percentage of | |
# mitochondrial genes here and store it in percent.mito using AddMetaData. | |
# We use [email protected] since this represents non-transformed and | |
# non-log-normalized counts The % of UMI mapping to MT-genes is a common | |
# scRNA-seq QC metric. | |
mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE) | |
percent.mito <- Matrix::colSums([email protected][mito.genes, ])/Matrix::colSums([email protected]) | |
# AddMetaData adds columns to [email protected], and is a great place to | |
# stash QC stats | |
pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito") | |
# We filter out cells that have unique gene counts over 2,500 or less than | |
# 200 Note that low.thresholds and high.thresholds are used to define a | |
# 'gate'. -Inf and Inf should be used if you don't want a lower or upper | |
# threshold. | |
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), | |
low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05)) | |
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", | |
scale.factor = 10000) | |
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, | |
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5) | |
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito")) | |
pbmc <- RunPCA(object = pbmc, pc.genes = [email protected], do.print = TRUE, pcs.print = 1:5, | |
genes.print = 5) | |
# save.SNN = T saves the SNN so that the clustering algorithm can be rerun | |
# using the same graph but with a different resolution value (see docs for | |
# full details) | |
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, | |
resolution = 0.6, print.output = 0, save.SNN = TRUE) | |
pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE) | |
# Save the preprocessed Seurat object to BiocFileCache ---- | |
tempFile <- tempfile(fileext = ".rds") | |
saveRDS(pbmc, tempFile) | |
seurat_pbmc3k_rname <- "seurat_pbmc3k" | |
if (!seurat_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) { | |
rpath <- bfcadd( | |
x = bfc, | |
rname = seurat_pbmc3k_rname, | |
fpath = tempFile) | |
} else { | |
rpath <- subset(bfcinfo(bfc), seurat_pbmc3k_rname == rname, "rpath", drop=TRUE) | |
} | |
# Save the SingleCellExperiment object to BiocFileCache ---- | |
sce <- as.SingleCellExperiment(pbmc) | |
tempFile <- tempfile(fileext = ".rds") | |
saveRDS(sce, tempFile) | |
sce_pbmc3k_rname <- "sce_pbmc3k" | |
if (!sce_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) { | |
rpath <- bfcadd( | |
x = bfc, | |
rname = sce_pbmc3k_rname, | |
fpath = tempFile) | |
} else { | |
rpath <- subset(bfcinfo(bfc), sce_pbmc3k_rname == rname, "rpath", drop=TRUE) | |
} | |
message("Completed") | |
timestamp() | |
## Markers and cell type annotations by cluster | |
# Cluster ID Markers Cell Type | |
# 0 IL7R CD4 T cells | |
# 1 CD14, LYZ CD14+ Monocytes | |
# 2 MS4A1 B cells | |
# 3 CD8A CD8 T cells | |
# 4 FCGR3A, MS4A7 FCGR3A+ Monocytes | |
# 5 GNLY, NKG7 NK cells | |
# 6 FCER1A, CST3 Dendritic Cells | |
# 7 PPBP Megakaryocytes |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment