kevinrue · November 27, 2018 21:51
diff --git a/Seurat_PBMC3k_TENxPBMCData_tutorial.Rmd b/Seurat_PBMC3k_TENxPBMCData_tutorial.Rmd
 ---
 title: "Seurat PBMC 3k tutorial using TENxPBMCData"
 author: "Kevin Rue-Albrecht"
 date: "27/11/2018"
 output: html_document
 editor_options: 
  chunk_output_type: console
 ---

 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 ```

 Fetch the SingleCellExperiment object using the `TENxPBMCData`.
 The first time download from the web and cache locally; subsequently from the local cache.

 ```{r}
 library(TENxPBMCData)
 tenx_pbmc3k <- TENxPBMCData(dataset = "pbmc3k")
 tenx_pbmc3k
 ```

 Prepare a sparse matrix that emulates the first section of the tutorial.

 ```{r}
 library(Matrix)
 pbmc.data <- as(counts(tenx_pbmc3k), "Matrix")
 pbmc.data <- as(pbmc.data, "dgTMatrix")
 colnames(pbmc.data) <- paste0("Cell", seq_len(ncol(pbmc.data)))
 rownames(pbmc.data) <- make.unique(rowData(tenx_pbmc3k)[, "Symbol_TENx", drop=TRUE])
 ```

 From here on, follow the Seurat tutorial to the letter.

 ```{r}
 library(Seurat)
 library(dplyr)
 # Initialize the Seurat object with the raw (non-normalized data).  Keep all
 # genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
 # least 200 detected genes
 pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")
 ```

 ```{r}
 # The number of genes and UMIs (nGene and nUMI) are automatically calculated
 # for every object by Seurat.  For non-UMI data, nUMI represents the sum of
 # the non-normalized values within a cell We calculate the percentage of
 # mitochondrial genes here and store it in percent.mito using AddMetaData.
 # We use [email protected] since this represents non-transformed and
 # non-log-normalized counts The % of UMI mapping to MT-genes is a common
 # scRNA-seq QC metric.
 mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
 percent.mito <- Matrix::colSums([email protected][mito.genes, ])/Matrix::colSums([email protected])

 # AddMetaData adds columns to [email protected], and is a great place to
 # stash QC stats
 pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
 VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
 ```

 ```{r}
 # GenePlot is typically used to visualize gene-gene relationships, but can
 # be used for anything calculated by the object, i.e. columns in
 # [email protected], PC scores etc.  Since there is a rare subset of cells
 # with an outlier level of high mitochondrial percentage and also low UMI
 # content, we filter these as well
 par(mfrow = c(1, 2))
 GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito")
 GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene")
 ```

 ```{r}
 # We filter out cells that have unique gene counts over 2,500 or less than
 # 200 Note that low.thresholds and high.thresholds are used to define a
 # 'gate'.  -Inf and Inf should be used if you don't want a lower or upper
 # threshold.
 pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"), 
    low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
 ```

 ```{r}
 pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", 
    scale.factor = 10000)
 ```

 ```{r}
 pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR, 
    x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
 ```

 ```{r}
 length(x = [email protected])
 ```

 ```{r}
 pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
 ```

 ```{r}
 pbmc <- RunPCA(object = pbmc, pc.genes = [email protected], do.print = TRUE, pcs.print = 1:5, 
    genes.print = 5)
 ```

 ```{r}
 # Examine and visualize PCA results a few different ways
 PrintPCA(object = pbmc, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
 ```

 ```{r}
 VizPCA(object = pbmc, pcs.use = 1:2)
 ```

 ```{r}
 PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
 ```

 ```{r}
 # ProjectPCA scores each gene in the dataset (including genes not included
 # in the PCA) based on their correlation with the calculated components.
 # Though we don't use this further here, it can be used to identify markers
 # that are strongly correlated with cellular heterogeneity, but may not have
 # passed through variable gene selection.  The results of the projected PCA
 # can be explored by setting use.full=T in the functions above
 pbmc <- ProjectPCA(object = pbmc, do.print = FALSE)
 ```

 ```{r}
 PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)
 ```

 ```{r}
 PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE, 
    label.columns = FALSE, use.full = FALSE)
 ```

 Small deviation from the tutorial. Skip the lengthy JackStraw computation.

 ```{r}
 # NOTE: This process can take a long time for big datasets, comment out for
 # expediency.  More approximate techniques such as those implemented in
 # PCElbowPlot() can be used to reduce computation time
 if (FALSE) {
    pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
    JackStrawPlot(object = pbmc, PCs = 1:12)
 }
 ```

 ```{r}
 PCElbowPlot(object = pbmc)
 ```

 ```{r}
 # save.SNN = T saves the SNN so that the clustering algorithm can be rerun
 # using the same graph but with a different resolution value (see docs for
 # full details)
 pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10, 
    resolution = 0.6, print.output = 0, save.SNN = TRUE)
 ```

 ```{r}
 PrintFindClustersParams(object = pbmc)
 ```

 ```{r}
 pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
 ```

 ```{r}
 # note that you can set do.label=T to help label individual clusters
 TSNEPlot(object = pbmc)
 ```

 Save the `seurat` object

 ```{r}
 saveRDS(pbmc, file = "pbmc3k_tutorial.rds")
 ```

 Save the equivalent `SingleCellExperiment` object

 ```{r}
 sce_pbmc <- as.SingleCellExperiment(pbmc)
 saveRDS(sce_pbmc, file = "pbmc3k_tutorial.sce.rds")
 ```

 Note that the marker genes used to later assign cell identities to the various clusters are listed below

 ```{r}
 ## Markers and cell type annotations by cluster
 # Cluster ID	Markers	        Cell Type
 # 0             IL7R	        CD4 T cells
 # 1             CD14, LYZ	    CD14+ Monocytes
 # 2	            MS4A1           B cells
 # 3	            CD8A	        CD8 T cells
 # 4	            FCGR3A, MS4A7	FCGR3A+ Monocytes
 # 5	            GNLY, NKG7	    NK cells
 # 6	            FCER1A, CST3	Dendritic Cells
 # 7	            PPBP	        Megakaryocytes
 ```
	---
	title: "Seurat PBMC 3k tutorial using TENxPBMCData"
	author: "Kevin Rue-Albrecht"
	date: "27/11/2018"
	output: html_document
	editor_options:
	chunk_output_type: console
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	```

	Fetch the SingleCellExperiment object using the `TENxPBMCData`.
	The first time download from the web and cache locally; subsequently from the local cache.

	```{r}
	library(TENxPBMCData)
	tenx_pbmc3k <- TENxPBMCData(dataset = "pbmc3k")
	tenx_pbmc3k
	```

	Prepare a sparse matrix that emulates the first section of the tutorial.

	```{r}
	library(Matrix)
	pbmc.data <- as(counts(tenx_pbmc3k), "Matrix")
	pbmc.data <- as(pbmc.data, "dgTMatrix")
	colnames(pbmc.data) <- paste0("Cell", seq_len(ncol(pbmc.data)))
	rownames(pbmc.data) <- make.unique(rowData(tenx_pbmc3k)[, "Symbol_TENx", drop=TRUE])
	```

	From here on, follow the Seurat tutorial to the letter.

	```{r}
	library(Seurat)
	library(dplyr)
	# Initialize the Seurat object with the raw (non-normalized data). Keep all
	# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
	# least 200 detected genes
	pbmc <- CreateSeuratObject(raw.data = pbmc.data, min.cells = 3, min.genes = 200, project = "10X_PBMC")
	```

	```{r}
	# The number of genes and UMIs (nGene and nUMI) are automatically calculated
	# for every object by Seurat. For non-UMI data, nUMI represents the sum of
	# the non-normalized values within a cell We calculate the percentage of
	# mitochondrial genes here and store it in percent.mito using AddMetaData.
	# We use [email protected] since this represents non-transformed and
	# non-log-normalized counts The % of UMI mapping to MT-genes is a common
	# scRNA-seq QC metric.
	mito.genes <- grep(pattern = "^MT-", x = rownames(x = pbmc@data), value = TRUE)
	percent.mito <- Matrix::colSums([email protected][mito.genes, ])/Matrix::colSums([email protected])

	# AddMetaData adds columns to [email protected], and is a great place to
	# stash QC stats
	pbmc <- AddMetaData(object = pbmc, metadata = percent.mito, col.name = "percent.mito")
	VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"), nCol = 3)
	```

	```{r}
	# GenePlot is typically used to visualize gene-gene relationships, but can
	# be used for anything calculated by the object, i.e. columns in
	# [email protected], PC scores etc. Since there is a rare subset of cells
	# with an outlier level of high mitochondrial percentage and also low UMI
	# content, we filter these as well
	par(mfrow = c(1, 2))
	GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito")
	GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene")
	```

	```{r}
	# We filter out cells that have unique gene counts over 2,500 or less than
	# 200 Note that low.thresholds and high.thresholds are used to define a
	# 'gate'. -Inf and Inf should be used if you don't want a lower or upper
	# threshold.
	pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
	low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
	```

	```{r}
	pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
	scale.factor = 10000)
	```

	```{r}
	pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
	x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)
	```

	```{r}
	length(x = [email protected])
	```

	```{r}
	pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"))
	```

	```{r}
	pbmc <- RunPCA(object = pbmc, pc.genes = [email protected], do.print = TRUE, pcs.print = 1:5,
	genes.print = 5)
	```

	```{r}
	# Examine and visualize PCA results a few different ways
	PrintPCA(object = pbmc, pcs.print = 1:5, genes.print = 5, use.full = FALSE)
	```

	```{r}
	VizPCA(object = pbmc, pcs.use = 1:2)
	```

	```{r}
	PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)
	```

	```{r}
	# ProjectPCA scores each gene in the dataset (including genes not included
	# in the PCA) based on their correlation with the calculated components.
	# Though we don't use this further here, it can be used to identify markers
	# that are strongly correlated with cellular heterogeneity, but may not have
	# passed through variable gene selection. The results of the projected PCA
	# can be explored by setting use.full=T in the functions above
	pbmc <- ProjectPCA(object = pbmc, do.print = FALSE)
	```

	```{r}
	PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced = TRUE, label.columns = FALSE)
	```

	```{r}
	PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced = TRUE,
	label.columns = FALSE, use.full = FALSE)
	```

	Small deviation from the tutorial. Skip the lengthy JackStraw computation.

	```{r}
	# NOTE: This process can take a long time for big datasets, comment out for
	# expediency. More approximate techniques such as those implemented in
	# PCElbowPlot() can be used to reduce computation time
	if (FALSE) {
	pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
	JackStrawPlot(object = pbmc, PCs = 1:12)
	}
	```

	```{r}
	PCElbowPlot(object = pbmc)
	```

	```{r}
	# save.SNN = T saves the SNN so that the clustering algorithm can be rerun
	# using the same graph but with a different resolution value (see docs for
	# full details)
	pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
	resolution = 0.6, print.output = 0, save.SNN = TRUE)
	```

	```{r}
	PrintFindClustersParams(object = pbmc)
	```

	```{r}
	pbmc <- RunTSNE(object = pbmc, dims.use = 1:10, do.fast = TRUE)
	```

	```{r}
	# note that you can set do.label=T to help label individual clusters
	TSNEPlot(object = pbmc)
	```

	Save the `seurat` object

	```{r}
	saveRDS(pbmc, file = "pbmc3k_tutorial.rds")
	```

	Save the equivalent `SingleCellExperiment` object

	```{r}
	sce_pbmc <- as.SingleCellExperiment(pbmc)
	saveRDS(sce_pbmc, file = "pbmc3k_tutorial.sce.rds")
	```

	Note that the marker genes used to later assign cell identities to the various clusters are listed below

	```{r}
	## Markers and cell type annotations by cluster
	# Cluster ID Markers Cell Type
	# 0 IL7R CD4 T cells
	# 1 CD14, LYZ CD14+ Monocytes
	# 2 MS4A1 B cells
	# 3 CD8A CD8 T cells
	# 4 FCGR3A, MS4A7 FCGR3A+ Monocytes
	# 5 GNLY, NKG7 NK cells
	# 6 FCER1A, CST3 Dendritic Cells
	# 7 PPBP Megakaryocytes
	```