Created
March 6, 2020 13:20
-
-
Save klprint/a0776d7f92f8d29cc233606467eaafb5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(hdf5r) | |
| library(Matrix) | |
| source("https://gist.githubusercontent.com/klprint/1ab4468eb3c54abcf0422dec6223b8fc/raw/b4cc33f5b4da25bcc2e678cf46b692fe67605460/single_cell_functions.R") | |
| library(SingleCellExperiment) | |
| library(tidyverse) | |
| sce.raw.pca <- function(sce, k = 100){ | |
| umi <- assay(sce, "umi") | |
| cat("Getting informative genes\n") | |
| info.genes <- get.info.genes(umi) | |
| cat("\tSetting info genes\n") | |
| umi <- umi[info.genes, ] | |
| cat("Normalization\n") | |
| nrm <- t(umi) / colSums(umi) | |
| cat("Running PCA\n") | |
| pca <- irlba::prcomp_irlba(nrm, n = k)$x | |
| rownames(pca) <- colnames(umi) | |
| reducedDim(sce, "raw_pca") <- pca | |
| return(sce) | |
| } | |
| sce.split.by.col <- function(sco, column, min.cells = 1, n.cores = 10){ | |
| lvls <- unique(colData(sco)[,column]) | |
| sco <- sco[,!is.na(colData(sco)[,column])] | |
| out <- pbmcapply::pbmclapply(lvls, function(x){ | |
| sel <- colData(sco)[,column] == x | |
| if(sum(sel, na.rm = T) <= min.cells) return(NA) | |
| sco[,sel] | |
| }, mc.cores = n.cores) | |
| names(out) <- lvls | |
| out <- out[!is.na(out)] | |
| return(out) | |
| } | |
| sce.run.liger <- function(sce.list, k = 25, assay.slot = "umi", plot.genes=F){ | |
| require(liger) | |
| umi <- lapply(sce.list, function(x) assay(x, assay.slot)) | |
| cat("Creating LIGER object\n") | |
| l = createLiger(umi) | |
| cat("Preprocessing LIGER\n") | |
| l = liger::normalize(l) | |
| l = liger::selectGenes(l, do.plot = plot.genes) | |
| l = liger::scaleNotCenter(l) | |
| cat("Running LIGER\n") | |
| l = optimizeALS(l, k = k) | |
| return(l) | |
| } | |
| run.umap <- function(X, n_components = 3, min_dist = .1, n_neighbors = 15){ | |
| set.seed(1234) | |
| uwot::umap( | |
| X, | |
| metric = "cosine", | |
| min_dist = min_dist, | |
| n_neighbors = n_neighbors, | |
| verbose = T, | |
| n_components = n_components | |
| ) | |
| } | |
| sce.run.umap <- function(sce, use.reduction = "liger", n_components = 3, min_dist = .1, n_neighbors = 15){ | |
| out <- run.umap(reducedDim(sce, use.reduction), n_components = n_components, min_dist = min_dist, n_neighbors = n_neighbors) | |
| rownames(out) <- rownames(reducedDim(sce, use.reduction)) | |
| reducedDim(sce, paste0("umap", n_components, "d")) <- out | |
| return(sce) | |
| } | |
| liger.to.sce <- function(liger.object, input.sce.list){ | |
| out <- SingleCellExperiment( | |
| assays = list(umi = do.call(merge.sparse, liger.object@raw.data)), | |
| colData = lapply(input.sce.list, colData) %>% do.call(rbind,.), | |
| reducedDims = list( | |
| liger = do.call(rbind, liger.object@H) | |
| ) | |
| ) | |
| return(out) | |
| } | |
| sce.pointcloud.cat <- function(sce, reduceDimName, categoryName){ | |
| emb <- reducedDim(sce, reduceDimName) | |
| ctg <- colData(sce)[,categoryName] | |
| ctg[is.na(ctg)] <- "ND" | |
| emb <- emb[order(ctg), ] | |
| ctg <- sort(ctg) | |
| babyplots::pointCloud(emb, "categories", ctg) | |
| } | |
| sce.pointcloud.gene <- function(sce, reduceDimName, gene, assay="umi"){ | |
| emb <- reducedDim(sce, reduceDimName) | |
| val <- log1p((assay(ct.comb["ENSG00000075891", ], assay) / colSums( assay(ct.comb, assay)))[1,]) | |
| babyplots::pointCloud(emb, "values", val) | |
| } | |
| sce.pointcloud.val <- function(sce, reduceDimName, val){ | |
| emb <- reducedDim(sce, reduceDimName) | |
| babyplots::pointCloud(emb, "values", val) | |
| } | |
| sce.pseudobulk <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10){ | |
| splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells) | |
| #print(splitted) | |
| if(length(splitted) == 0){ | |
| return(NA) | |
| } | |
| pb <- pbmcapply::pbmclapply(splitted, function(x) rowSums(assay(x, assay)), mc.cores = mc.cores) %>% | |
| do.call(cbind, .) | |
| colnames(pb) <- names(splitted) | |
| return(pb) | |
| } | |
| sce.to.anndata <- function(sce, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){ | |
| cat("Creating AnnData Object\n") | |
| sc <- import("scanpy") | |
| ad <- sc$AnnData(X = t(assay(sce, assay)), | |
| obs = as.data.frame(colData(sce)), | |
| obsm = reducedDims(sce)@listData[!startsWith(reducedDimNames(sce), "X_")]) | |
| print(ad) | |
| sc$pp$neighbors(ad, n_neighbors = as.integer(n_neighbors), use_rep = neighbors_from, n_pcs = as.integer(n_pcs)) | |
| return(ad) | |
| } | |
| sce.diffmap <- function(sce, n_diff_comps = 5, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){ | |
| sc <- import("scanpy") | |
| ad <- sce.to.anndata(sce, | |
| n_neighbors = as.integer(n_neighbors), | |
| neighbors_from = neighbors_from, | |
| n_pcs = as.integer(n_pcs), | |
| assay = assay) | |
| cat("Running diffusion map\n") | |
| sc$tl$diffmap(ad, n_comps = as.integer(n_diff_comps)) | |
| sce@metadata$ad <- ad$copy() | |
| reducedDim(sce, "X_diffmap") <- ad$obsm["X_diffmap"] | |
| return(sce) | |
| } | |
| sce.dpt <- function(sce, root_cell_index, n_dcs = 5){ | |
| stopifnot("X_diffmap" %in% reducedDimNames(sce)) | |
| sc <- import("scanpy") | |
| ad <- sce@metadata$ad | |
| ad$uns["iroot"] <- as.integer(root_cell_index-1) | |
| cat("Running DPT\n") | |
| sc$tl$dpt(ad, n_dcs = as.integer(n_dcs)) | |
| sce$dpt <- ad$obs["dpt_pseudotime"][,1] | |
| sce$dpt_rank <- rank(sce$dpt, ties.method = "first") | |
| sce$dpt_rank <- sce$dpt_rank - min(sce$dpt_rank) | |
| sce$dpt_rank <- sce$dpt_rank / max(sce$dpt_rank) | |
| return(sce) | |
| } | |
| sce.plot.3d <- function(sce, val, redDim = "umap3d", width = NULL, height=NULL){ | |
| coord <- reducedDim(sce, redDim)[,1:3] | |
| if(val %in% colnames(colData(sce))){ | |
| value = colData(sce)[,val] | |
| }else if(val %in% rownames(sce)){ | |
| value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
| }else{ | |
| stop(paste0("Could not find \"", val, "\" in the object!")) | |
| } | |
| val = value | |
| if(is.numeric(val)){ | |
| val[is.na(val)] <- 0 | |
| babyplots::pointCloud( | |
| coord, | |
| "values", | |
| val, | |
| width = width, | |
| height = height | |
| ) | |
| }else if(is.factor(val) | is.character(val)){ | |
| val[is.na(val)] <- "NA" | |
| coord <- coord[order(val), ] | |
| val <- sort(val) | |
| babyplots::pointCloud( | |
| coord, | |
| "categories", | |
| val, | |
| width = width, | |
| height = height | |
| ) | |
| }else{ | |
| stop("val is neither numeric, nor a character or factor!") | |
| } | |
| } | |
| sce.plot.2d <- function(sce, val, redDim = "umap2d", comps = 1:2, point_size=.1, with_repel = F, repel_size=3, do.sort=F, force.discrete = F){ | |
| coord <- reducedDim(sce, redDim)[,comps] | |
| val.orig <- val | |
| has.rowdata <- ifelse(!is.null(ncol(rowData(sce))), | |
| ifelse("gene_name" %in% colnames(rowData(sce)), | |
| ifelse(val %in% rowData(sce)[,"gene_name"], | |
| T, | |
| F), | |
| F), | |
| F) | |
| if(length(val) == ncol(sce)){ | |
| value = val | |
| }else if(val %in% colnames(colData(sce))){ | |
| value = colData(sce)[,val] | |
| }else if(val %in% rownames(sce)){ | |
| value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
| }else if( has.rowdata ){ | |
| val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)] | |
| print(val) | |
| value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
| }else{ | |
| stop(paste0("Could not find \"", val, "\" in the object!")) | |
| } | |
| if(force.discrete) value <- factor(as.character(value), levels = as.character(sort(unique(value)))) | |
| dat <- tibble( | |
| cell_id = rownames(coord), | |
| x = coord[,1], | |
| y = coord[,2] | |
| ) %>% | |
| left_join( | |
| tibble( | |
| cell_id = rownames(colData(sce)), | |
| val = value | |
| ) | |
| ) | |
| if(is.numeric(value)){ | |
| if(do.sort){ | |
| dat <- dat %>% arrange(val) | |
| } | |
| p <- dat %>% | |
| ggplot(aes(x,y,color = val)) + | |
| geom_point(size=point_size) | |
| p <- p + | |
| scale_color_continuous(name = val.orig, low="gray", high="red") | |
| }else{ | |
| p <- dat %>% | |
| ggplot(aes(x,y,color = val)) + | |
| geom_point(size=point_size) | |
| p <- p + | |
| scale_color_discrete(name=val.orig) + | |
| guides(color = guide_legend(override.aes = aes(size = 2))) | |
| } | |
| if(with_repel){ | |
| p <- p + ggrepel::geom_label_repel( | |
| data = function(x){ | |
| x %>% group_by(val) %>% | |
| summarise(x = median(x), | |
| y = median(y)) | |
| }, | |
| mapping = aes(label = val), size = repel_size | |
| ) + | |
| theme(legend.position = "none") | |
| } | |
| return(p) | |
| } | |
| sce.run.liger.full = function(sce, min.cells = 50, k=25, split.by = "batch"){ | |
| if(!is.na(ncol(rowData(sce)))) row.data = rowData(sce) | |
| cat("Splitting\n") | |
| data.list = sce.split.by.col(sce, split.by, min.cells = min.cells) | |
| cat("Running liger\n") | |
| data.liger = sce.run.liger(data.list,k=k) | |
| cat("Combining to SCE object\n") | |
| out = liger.to.sce(data.liger, data.list) | |
| tmp.rd = matrix(NA, ncol=k, nrow=nrow(out), dimnames = list(rownames(out), paste0("liger_", 1:k))) | |
| tmp.rd[colnames(data.liger@W), ] = t(data.liger@W) | |
| tmp.rd = as.data.frame(tmp.rd) | |
| if(!is.na(ncol(rowData(sce)))){ | |
| rowData(out) <- tmp.rd %>% | |
| rownames_to_column("gene_id") %>% | |
| left_join( | |
| as.data.frame(row.data) %>% | |
| rownames_to_column("gene_id") | |
| ) %>% | |
| as.data.frame() %>% | |
| column_to_rownames("gene_id") | |
| }else{ | |
| rowData(out) = tmp.rd | |
| } | |
| cat("UMAP") | |
| out = sce.run.umap(out, n_components = 2) | |
| return(out) | |
| } | |
| sce.liger.pheatmap = function(sce, order.col.by = "dpt", annot.cols = NULL){ | |
| annot = NULL | |
| if(!is.null(annot.cols)){ | |
| annot = as.data.frame(colData(sce)[,annot.cols]) | |
| rownames(annot) <- colnames(sce) | |
| colnames(annot) <- annot.cols | |
| } | |
| print(head(annot)) | |
| ph.data = t(reducedDim(sce, "liger")[order(colData(sce)[,order.col.by]), ]) | |
| rownames(ph.data) = paste0("liger_", 1:nrow(ph.data)) | |
| print(head(ph.data)[,1:5]) | |
| pheatmap::pheatmap( | |
| ph.data, | |
| show_rownames=T, | |
| show_colnames=F, | |
| cluster_cols=F, | |
| clustering_distance_rows="correlation", | |
| color=colorRampPalette(c("white", "black"))(100), | |
| annotation_col = annot | |
| ) | |
| } | |
| sce.subsample.per.group <- function(sce, group.by, n.cells=1000, min.cells = 25, allow.min = F, | |
| mc.ncores = 10){ | |
| sce.list <- sce.split.by.col(sce, column = group.by, min.cells = min.cells, n.cores = mc.ncores) | |
| if(!allow.min) sce.list <- sce.list[lapply(sce.list, ncol) %>% do.call("c", .) >= 1000] | |
| sce.list <- lapply(sce.list, function(x){ | |
| if(ncol(x) > n.cells){ | |
| x <- x[,sample(colnames(x), n.cells, replace = F)] | |
| } | |
| return(x) | |
| }) | |
| out <- do.call(cbind, sce.list) | |
| return(out) | |
| } | |
| sce.gene.enrichment <- function(sce, | |
| in.group, | |
| label, | |
| balance.group = NULL, | |
| do.balance = F, | |
| subsample.to = 500, | |
| allow.min.cells = F, | |
| min.cells = 25, | |
| mc.cores=10, | |
| use.hvg = T, | |
| frac.min=.25){ | |
| ## Subsampling, if wanted | |
| if(do.balance){ | |
| sce <- sce.subsample.per.group(sce, | |
| group.by = in.group, | |
| n.cells = subsample.to, | |
| min.cells = min.cells, | |
| allow.min = allow.min.cells, | |
| mc.ncores = mc.cores) | |
| } | |
| ## | |
| umi <- assay(sce, "umi") | |
| sf <- colSums(umi) | |
| if(use.hvg) umi <- umi[get.info.genes(umi), ] | |
| group.labels <- colData(sce)[,in.group] | |
| are.ingroup <- group.labels %in% label | |
| test.genes <- rowSums( umi[,are.ingroup] > 0 ) > floor(frac.min * sum(are.ingroup)) | |
| umi <- umi[test.genes, ] | |
| out <- pbmcapply::pbmclapply(rownames(umi), function(x){ | |
| tibble( | |
| gene_id = x, | |
| log2fc = log2(mean( log1p(umi[x,are.ingroup] / sf[are.ingroup])) / mean(log1p(umi[x,!are.ingroup] / sf[!are.ingroup]))), | |
| p.value = wilcox.test(x = log1p(umi[x,are.ingroup] / sf[are.ingroup]), y = log1p(umi[x,!are.ingroup] / sf[!are.ingroup]), alternative = "greater" )$p.value | |
| ) | |
| }, mc.cores = mc.cores) %>% | |
| do.call(rbind, .) | |
| out$p.adjust = p.adjust(out$p.value, method = "BH") | |
| if(!is.null(ncol(rowData(sce)))){ | |
| if("gene_name" %in% colnames(rowData(sce))){ | |
| out <- out %>% left_join( | |
| as.data.frame(rowData(sce)) %>% | |
| rownames_to_column("gene_id") %>% | |
| select("gene_id", "gene_name") | |
| ) | |
| } | |
| } | |
| return(out %>% arrange(desc(log2fc))) | |
| } | |
| sce.all.markers <- function(sce, | |
| in.group, | |
| balance.group = NULL, | |
| do.balance = F, | |
| subsample.to = 500, | |
| allow.min.cells = F, | |
| min.cells = 25, | |
| mc.cores=10, | |
| use.hvg = T, | |
| frac.min=.25){ | |
| lvls <- unique(colData(sce)[,in.group]) | |
| out <- lapply(lvls, function(i){ | |
| cat("Finding genes for ", i, "\n") | |
| sce.gene.enrichment(sce = sce, | |
| in.group = in.group, | |
| label = i, | |
| balance.group = balance.group, | |
| do.balance = do.balance, | |
| subsample.to = subsample.to, | |
| allow.min.cells = allow.min.cells, | |
| min.cells = min.cells, | |
| mc.cores=mc.cores, | |
| use.hvg = use.hvg, | |
| frac.min=frac.min) | |
| }) | |
| names(out) <- lvls | |
| return(out) | |
| } | |
| sce.plot.along <- function(sce, val, along = "stage.ord", as.boxplot = F, as.violin = F, | |
| pointsize = .1, color=NULL){ | |
| val.orig <- val | |
| has.rowdata <- ifelse(!is.null(ncol(rowData(sce))), | |
| ifelse("gene_name" %in% colnames(rowData(sce)), | |
| ifelse(val %in% rowData(sce)[,"gene_name"], | |
| T, | |
| F), | |
| F), | |
| F) | |
| if(length(val) == ncol(sce)){ | |
| value = val | |
| }else if(val %in% rownames(sce)){ | |
| value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
| }else if( has.rowdata ){ | |
| val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)] | |
| print(val) | |
| value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
| }else{ | |
| stop(paste0("Could not find \"", val, "\" in the object!")) | |
| } | |
| if(!is.null(color)){ | |
| df <- tibble( | |
| cell_id = colnames(sce), | |
| exprs = value | |
| ) %>% | |
| left_join( | |
| as.data.frame(colData(sce)) %>% | |
| rownames_to_column("cell_id") | |
| ) %>% | |
| select(exprs, along, color) %>% | |
| dplyr::rename(x = along, | |
| group = color) | |
| }else{ | |
| df <- tibble( | |
| cell_id = colnames(sce), | |
| exprs = value | |
| ) %>% | |
| left_join( | |
| as.data.frame(colData(sce)) %>% | |
| rownames_to_column("cell_id") | |
| ) %>% | |
| select(exprs, along, color) %>% | |
| dplyr::rename(x = along) | |
| } | |
| plt <- df %>% | |
| ggplot(aes(x=x, y=exprs)) + | |
| xlab(along) | |
| if(as.boxplot){ | |
| plt <- plt + geom_boxplot() | |
| }else if(as.violin){ | |
| plt <- plt + geom_violin() | |
| }else if(!is.null(color)){ | |
| plt <- plt + geom_point(aes(color=group), | |
| size=pointsize) | |
| }else{ | |
| plt <- plt + geom_point(size=pointsize) | |
| } | |
| if(!is.numeric(colData(sce)[,along])){ | |
| plt <- plt+ | |
| theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)) | |
| } | |
| return(plt) | |
| } | |
| sce.get.gene.exprs <- function(sce, gene, umi=F){ | |
| out <- NULL | |
| gene.orig <- gene | |
| if(!is.null(ncol(rowData(sce)))){ | |
| if("gene_name" %in% colnames(rowData(sce))){ | |
| if(gene %in% rowData(sce)[,"gene_name"]){ | |
| gene <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)] | |
| } | |
| } | |
| } | |
| if(gene %in% rownames(sce)){ | |
| if(!umi){ | |
| out <- log1p(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi"))) | |
| }else{ | |
| out <- assay(sce, "umi")[gene,] | |
| } | |
| } | |
| return(out) | |
| } | |
| sce.dotplot <- function(sce, genes, group.by = "cell_type"){ | |
| tmp <- lapply(genes, function(g){ | |
| exprs <- sce.get.gene.exprs(sce, g, umi = T) | |
| if(is.null(exprs)){ | |
| return(NULL) | |
| }else{ | |
| colData(sce) %>% | |
| as_tibble() %>% | |
| add_column(umi = exprs) %>% | |
| mutate(exprs = umi / colSums(assay(sce, "umi"))) %>% | |
| mutate(expresses = ifelse(umi > 0 , 1, 0)) %>% | |
| dplyr::rename(gr = group.by) %>% | |
| group_by(gr) %>% | |
| summarise(exprs = mean(exprs), | |
| expresses = sum(expresses) / n()) %>% | |
| add_column(gene = g) %>% | |
| mutate(exprs = ((exprs - min(exprs)) / max(exprs - min(exprs)))) | |
| } | |
| }) %>% do.call(rbind, .) | |
| if(is.null(tmp)) stop("No genes found!\n") | |
| tmp %>% | |
| ggplot(aes(x=gene, y=gr)) + | |
| geom_point(aes(color=exprs, size=expresses)) + | |
| theme(axis.text.x = element_text(angle=90, hjust=1,vjust=.5)) + | |
| scale_color_continuous(name = "scaled mean expr.", low="gray", high="red") + | |
| scale_size_continuous(name="fraction pos.") + | |
| ylab(group.by) | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment