kevinrue · November 25, 2018 18:21
diff --git a/Seurat_PBMC3k_tutorial.R b/Seurat_PBMC3k_tutorial.R

 # Adapted from https://satijalab.org/seurat/pbmc3k_tutorial.html on 2018-11-24

 timestamp()
 message("Started")

 library(BiocFileCache)
 library(Seurat)
 library(dplyr)

 # Download the data programmatically ----
 bfc <- BiocFileCache()
 pbmc3k_rname <- "pbmc3k_filtered_gene_bc_matrices.tar.gz"
 if (!pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
    rpath <- bfcadd(
        x = bfc,
        rname = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
        fpath = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz")
 } else {
    rpath <- subset(bfcinfo(bfc), pbmc3k_rname == rname, "rpath", drop=TRUE)
 }

 tempDir <- tempdir()
 untar(rpath, exdir = tempDir)

 # Summarized Seurat tutorial ----

 # Load the PBMC dataset
 pbmc.data <- Read10X(data.dir = file.path(tempDir, "filtered_gene_bc_matrices", "hg19"))

 # Initialize the Seurat object with the raw (non-normalized data).  Keep all
 # genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
 # least 200 detected genes
 pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")

 # The number of genes and UMIs (nGene and nUMI) are automatically calculated
 # for every object by Seurat.  For non-UMI data, nUMI represents the sum of
 # the non-normalized values within a cell We calculate the percentage of
 # mitochondrial genes here and store it in percent.mito using AddMetaData.
 # We use [email protected] since this represents non-transformed and
 # non-log-normalized counts The % of UMI mapping to MT-genes is a common
 # scRNA-seq QC metric.
 mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
 percent.mito <- Matrix::colSums([email protected][mito.genes, ])/Matrix::colSums([email protected])

 # AddMetaData adds columns to [email protected], and is a great place to
 # stash QC stats
 pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")

 # We filter out cells that have unique gene counts over 2,500 or less than
 # 200 Note that low.thresholds and high.thresholds are used to define a
 # 'gate'.  -Inf and Inf should be used if you don't want a lower or upper
 # threshold.
 pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), 
                    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

 pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
                      scale.factor = 10000)

 pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, 
                          x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

 pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))

 pbmc <- RunPCA(object = pbmc, pc.genes = [email protected], do.print = TRUE, pcs.print = 1:5, 
               genes.print = 5)

 # save.SNN = T saves the SNN so that the clustering algorithm can be rerun
 # using the same graph but with a different resolution value (see docs for
 # full details)
 pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, 
                     resolution = 0.6, print.output = 0, save.SNN = TRUE)

 pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)

 # Save the preprocessed Seurat object to BiocFileCache ----

 tempFile <- tempfile(fileext = ".rds")
 saveRDS(pbmc, tempFile)

 seurat_pbmc3k_rname <- "seurat_pbmc3k"
 if (!seurat_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
    rpath <- bfcadd(
        x = bfc,
        rname = seurat_pbmc3k_rname,
        fpath = tempFile)
 } else {
    rpath <- subset(bfcinfo(bfc), seurat_pbmc3k_rname == rname, "rpath", drop=TRUE)
 }

 # Save the SingleCellExperiment object to BiocFileCache ----

 sce <- as.SingleCellExperiment(pbmc)

 tempFile <- tempfile(fileext = ".rds")
 saveRDS(sce, tempFile)

 sce_pbmc3k_rname <- "sce_pbmc3k"
 if (!sce_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
    rpath <- bfcadd(
        x = bfc,
        rname = sce_pbmc3k_rname,
        fpath = tempFile)
 } else {
    rpath <- subset(bfcinfo(bfc), sce_pbmc3k_rname == rname, "rpath", drop=TRUE)
 }

 message("Completed")
 timestamp()

 ## Markers and cell type annotations by cluster
 # Cluster ID	Markers	        Cell Type
 # 0             IL7R	        CD4 T cells
 # 1             CD14, LYZ	    CD14+ Monocytes
 # 2	            MS4A1           B cells
 # 3	            CD8A	        CD8 T cells
 # 4	            FCGR3A, MS4A7	FCGR3A+ Monocytes
 # 5	            GNLY, NKG7	    NK cells
 # 6	            FCER1A, CST3	Dendritic Cells
 # 7	            PPBP	        Megakaryocytes

	# Adapted from https://satijalab.org/seurat/pbmc3k_tutorial.html on 2018-11-24

	timestamp()
	message("Started")

	library(BiocFileCache)
	library(Seurat)
	library(dplyr)

	# Download the data programmatically ----
	bfc <- BiocFileCache()
	pbmc3k_rname <- "pbmc3k_filtered_gene_bc_matrices.tar.gz"
	if (!pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
	rpath <- bfcadd(
	x = bfc,
	rname = "pbmc3k_filtered_gene_bc_matrices.tar.gz",
	fpath = "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz")
	} else {
	rpath <- subset(bfcinfo(bfc), pbmc3k_rname == rname, "rpath", drop=TRUE)
	}

	tempDir <- tempdir()
	untar(rpath, exdir = tempDir)

	# Summarized Seurat tutorial ----

	# Load the PBMC dataset
	pbmc.data <- Read10X(data.dir = file.path(tempDir, "filtered_gene_bc_matrices", "hg19"))

	# Initialize the Seurat object with the raw (non-normalized data). Keep all
	# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
	# least 200 detected genes
	pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")

	# The number of genes and UMIs (nGene and nUMI) are automatically calculated
	# for every object by Seurat. For non-UMI data, nUMI represents the sum of
	# the non-normalized values within a cell We calculate the percentage of
	# mitochondrial genes here and store it in percent.mito using AddMetaData.
	# We use [email protected] since this represents non-transformed and
	# non-log-normalized counts The % of UMI mapping to MT-genes is a common
	# scRNA-seq QC metric.
	mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
	percent.mito <- Matrix::colSums([email protected][mito.genes, ])/Matrix::colSums([email protected])

	# AddMetaData adds columns to [email protected], and is a great place to
	# stash QC stats
	pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")

	# We filter out cells that have unique gene counts over 2,500 or less than
	# 200 Note that low.thresholds and high.thresholds are used to define a
	# 'gate'. -Inf and Inf should be used if you don't want a lower or upper
	# threshold.
	pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
	low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))

	pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
	scale.factor = 10000)

	pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
	x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

	pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))

	pbmc <- RunPCA(object = pbmc, pc.genes = [email protected], do.print = TRUE, pcs.print = 1:5,
	genes.print = 5)

	# save.SNN = T saves the SNN so that the clustering algorithm can be rerun
	# using the same graph but with a different resolution value (see docs for
	# full details)
	pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
	resolution = 0.6, print.output = 0, save.SNN = TRUE)

	pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)

	# Save the preprocessed Seurat object to BiocFileCache ----

	tempFile <- tempfile(fileext = ".rds")
	saveRDS(pbmc, tempFile)

	seurat_pbmc3k_rname <- "seurat_pbmc3k"
	if (!seurat_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
	rpath <- bfcadd(
	x = bfc,
	rname = seurat_pbmc3k_rname,
	fpath = tempFile)
	} else {
	rpath <- subset(bfcinfo(bfc), seurat_pbmc3k_rname == rname, "rpath", drop=TRUE)
	}

	# Save the SingleCellExperiment object to BiocFileCache ----

	sce <- as.SingleCellExperiment(pbmc)

	tempFile <- tempfile(fileext = ".rds")
	saveRDS(sce, tempFile)

	sce_pbmc3k_rname <- "sce_pbmc3k"
	if (!sce_pbmc3k_rname %in% bfcinfo(bfc)[, "rname", drop=TRUE]) {
	rpath <- bfcadd(
	x = bfc,
	rname = sce_pbmc3k_rname,
	fpath = tempFile)
	} else {
	rpath <- subset(bfcinfo(bfc), sce_pbmc3k_rname == rname, "rpath", drop=TRUE)
	}

	message("Completed")
	timestamp()

	## Markers and cell type annotations by cluster
	# Cluster ID Markers Cell Type
	# 0 IL7R CD4 T cells
	# 1 CD14, LYZ CD14+ Monocytes
	# 2 MS4A1 B cells
	# 3 CD8A CD8 T cells
	# 4 FCGR3A, MS4A7 FCGR3A+ Monocytes
	# 5 GNLY, NKG7 NK cells
	# 6 FCER1A, CST3 Dendritic Cells
	# 7 PPBP Megakaryocytes