Created
September 28, 2020 13:21
-
-
Save klprint/47f2a95974da472628dc0a5aa866f611 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(hdf5r) | |
library(Matrix) | |
source("https://gist.githubusercontent.com/klprint/1ab4468eb3c54abcf0422dec6223b8fc/raw/b4cc33f5b4da25bcc2e678cf46b692fe67605460/single_cell_functions.R") | |
library(SingleCellExperiment) | |
library(tidyverse) | |
library(liger) | |
sce.raw.pca <- function(sce, k = 100){ | |
umi <- assay(sce, "umi") | |
cat("Getting informative genes\n") | |
info.genes <- get.info.genes(umi) | |
cat("\tSetting info genes\n") | |
umi <- umi[info.genes, ] | |
cat("Normalization\n") | |
nrm <- t(umi) / colSums(umi) | |
cat("Running PCA\n") | |
pca <- irlba::prcomp_irlba(nrm, n = k)$x | |
rownames(pca) <- colnames(umi) | |
reducedDim(sce, "raw_pca") <- pca | |
return(sce) | |
} | |
sce.split.by.col <- function(sco, column, min.cells = 1, n.cores = 10){ | |
lvls <- unique(colData(sco)[,column]) | |
sco <- sco[,!is.na(colData(sco)[,column])] | |
out <- pbmcapply::pbmclapply(lvls, function(x){ | |
sel <- colData(sco)[,column] == x | |
if(sum(sel, na.rm = T) <= min.cells) return(NA) | |
sco[,sel] | |
}, mc.cores = n.cores) | |
names(out) <- lvls | |
out <- out[!is.na(out)] | |
return(out) | |
} | |
sce.run.liger <- function(sce.list, k = 25, assay.slot = "umi", plot.genes=F){ | |
require(liger) | |
umi <- lapply(sce.list, function(x) assay(x, assay.slot)) | |
cat("Creating LIGER object\n") | |
l = createLiger(umi) | |
cat("Preprocessing LIGER\n") | |
l = liger::normalize(l) | |
l = liger::selectGenes(l, do.plot = plot.genes) | |
l = liger::scaleNotCenter(l) | |
cat("Running LIGER\n") | |
l = optimizeALS(l, k = k) | |
return(l) | |
} | |
run.umap <- function(X, n_components = 3, min_dist = .1, n_neighbors = 15, ret_nn = F){ | |
set.seed(1234) | |
uwot::umap( | |
X, | |
metric = "cosine", | |
min_dist = min_dist, | |
n_neighbors = n_neighbors, | |
verbose = T, | |
n_components = n_components, | |
ret_nn = ret_nn | |
) | |
} | |
sce.run.umap <- function(sce, use.reduction = "liger", n_components = 3, min_dist = .1, n_neighbors = 15){ | |
out <- run.umap(reducedDim(sce, use.reduction), n_components = n_components, min_dist = min_dist, n_neighbors = n_neighbors) | |
rownames(out) <- rownames(reducedDim(sce, use.reduction)) | |
reducedDim(sce, paste0("umap", n_components, "d")) <- out | |
return(sce) | |
} | |
liger.to.sce <- function(liger.object, input.sce.list){ | |
out <- SingleCellExperiment( | |
assays = list(umi = do.call(merge.sparse, [email protected])), | |
colData = lapply(input.sce.list, colData) %>% do.call(rbind,.), | |
reducedDims = list( | |
liger = do.call(rbind, liger.object@H) | |
) | |
) | |
return(out) | |
} | |
sce.pointcloud.cat <- function(sce, reduceDimName, categoryName){ | |
emb <- reducedDim(sce, reduceDimName) | |
ctg <- colData(sce)[,categoryName] | |
ctg[is.na(ctg)] <- "ND" | |
emb <- emb[order(ctg), ] | |
ctg <- sort(ctg) | |
babyplots::pointCloud(emb, "categories", ctg) | |
} | |
sce.pointcloud.gene <- function(sce, reduceDimName, gene, assay="umi"){ | |
emb <- reducedDim(sce, reduceDimName) | |
val <- log1p((assay(ct.comb["ENSG00000075891", ], assay) / colSums( assay(ct.comb, assay)))[1,]) | |
babyplots::pointCloud(emb, "values", val) | |
} | |
sce.pointcloud.val <- function(sce, reduceDimName, val){ | |
emb <- reducedDim(sce, reduceDimName) | |
babyplots::pointCloud(emb, "values", val) | |
} | |
sce.pseudobulk <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10, subsample.to = NULL){ | |
splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells) | |
#print(splitted) | |
if(length(splitted) == 0){ | |
return(NA) | |
} | |
pb <- pbmcapply::pbmclapply(splitted, function(x) { | |
if(!is.null(subsample.to)){ | |
if(ncol(x) > subsample.to){ | |
set.seed(1234) | |
x <- x[,sample(colnames(x), subsample.to, replace = F)] | |
} | |
} | |
rowSums(assay(x, assay)) | |
}, mc.cores = mc.cores) %>% | |
do.call(cbind, .) | |
colnames(pb) <- names(splitted) | |
return(pb) | |
} | |
sce.pseudobulk.center <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10, subsample.to = NULL){ | |
splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells) | |
#print(splitted) | |
if(length(splitted) == 0){ | |
return(NA) | |
} | |
pb <- pbmcapply::pbmclapply(splitted, function(x) { | |
if(!is.null(subsample.to)){ | |
if(ncol(x) > subsample.to){ | |
set.seed(1234) | |
x <- x[,sample(colnames(x), subsample.to, replace = F)] | |
} | |
} | |
rowMeans(assay(x, assay)) | |
}, mc.cores = mc.cores) %>% | |
do.call(cbind, .) | |
colnames(pb) <- names(splitted) | |
return(pb) | |
} | |
sce.to.anndata <- function(sce, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){ | |
cat("Creating AnnData Object\n") | |
sc <- import("scanpy") | |
ad <- sc$AnnData(X = t(assay(sce, assay)), | |
obs = as.data.frame(colData(sce)), | |
obsm = reducedDims(sce)@listData[!startsWith(reducedDimNames(sce), "X_")]) | |
print(ad) | |
sc$pp$neighbors(ad, n_neighbors = as.integer(n_neighbors), use_rep = neighbors_from, n_pcs = as.integer(n_pcs)) | |
return(ad) | |
} | |
sce.diffmap <- function(sce, n_diff_comps = 5, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){ | |
sc <- import("scanpy") | |
ad <- sce.to.anndata(sce, | |
n_neighbors = as.integer(n_neighbors), | |
neighbors_from = neighbors_from, | |
n_pcs = as.integer(n_pcs), | |
assay = assay) | |
cat("Running diffusion map\n") | |
sc$tl$diffmap(ad, n_comps = as.integer(n_diff_comps)) | |
sce@metadata$ad <- ad$copy() | |
reducedDim(sce, "X_diffmap") <- ad$obsm["X_diffmap"] | |
return(sce) | |
} | |
sce.dpt <- function(sce, root_cell_index, n_dcs = 5){ | |
stopifnot("X_diffmap" %in% reducedDimNames(sce)) | |
sc <- import("scanpy") | |
ad <- sce@metadata$ad | |
ad$uns["iroot"] <- as.integer(root_cell_index-1) | |
cat("Running DPT\n") | |
sc$tl$dpt(ad, n_dcs = as.integer(n_dcs)) | |
sce$dpt <- ad$obs["dpt_pseudotime"][,1] | |
sce$dpt_rank <- rank(sce$dpt, ties.method = "first") | |
sce$dpt_rank <- sce$dpt_rank - min(sce$dpt_rank) | |
sce$dpt_rank <- sce$dpt_rank / max(sce$dpt_rank) | |
return(sce) | |
} | |
sce.plot.3d <- function(sce, val, redDim = "umap3d", width = NULL, height=NULL){ | |
coord <- reducedDim(sce, redDim)[,1:3] | |
if(val %in% colnames(colData(sce))){ | |
value = colData(sce)[,val] | |
}else if(val %in% rownames(sce)){ | |
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
}else{ | |
stop(paste0("Could not find \"", val, "\" in the object!")) | |
} | |
val = value | |
if(is.numeric(val)){ | |
val[is.na(val)] <- 0 | |
babyplots::pointCloud( | |
coord, | |
"values", | |
val, | |
width = width, | |
height = height | |
) | |
}else if(is.factor(val) | is.character(val)){ | |
val[is.na(val)] <- "NA" | |
coord <- coord[order(val), ] | |
val <- sort(val) | |
babyplots::pointCloud( | |
coord, | |
"categories", | |
val, | |
width = width, | |
height = height | |
) | |
}else{ | |
stop("val is neither numeric, nor a character or factor!") | |
} | |
} | |
sce.plot.2d <- function(sce, | |
val, | |
redDim = "umap2d", | |
comps = 1:2, | |
point_size=.1, | |
alpha = 1, | |
with_repel = F, | |
repel_size=3, | |
do.sort=F, | |
force.discrete = F, | |
color.palette.vector = c("gray", rev(viridis::inferno(100))), | |
color.cubehelix = F, | |
color.inferno = F){ | |
coord <- reducedDim(sce, redDim)[,comps] | |
if(color.cubehelix){ | |
color.palette.vector <- rev(rje::cubeHelix(100)) | |
} | |
if(color.inferno){ | |
color.palette.vector <- viridis::inferno(100) | |
} | |
val.orig <- val | |
has.rowdata <- ifelse(!is.null(ncol(rowData(sce))), | |
ifelse("gene_name" %in% colnames(rowData(sce)), | |
ifelse(val %in% rowData(sce)[,"gene_name"], | |
T, | |
F), | |
F), | |
F) | |
if(length(val) == ncol(sce)){ | |
value = val | |
}else if(val %in% colnames(colData(sce))){ | |
value = colData(sce)[,val] | |
}else if(val %in% rownames(sce)){ | |
value = sqrt(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
}else if( has.rowdata ){ | |
val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)] | |
print(val) | |
value = sqrt(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
}else{ | |
stop(paste0("Could not find \"", val, "\" in the object!")) | |
} | |
if(force.discrete) value <- factor(as.character(value), levels = as.character(sort(unique(value)))) | |
dat <- tibble( | |
cell_id = rownames(coord), | |
x = coord[,1], | |
y = coord[,2] | |
) %>% | |
left_join( | |
tibble( | |
cell_id = rownames(colData(sce)), | |
val = value | |
), | |
by="cell_id" | |
) %>% ungroup() %>% sample_n(n()) | |
if(is.numeric(value)){ | |
if(do.sort){ | |
dat <- dat %>% arrange(val) | |
} | |
p <- dat %>% | |
ggplot(aes(x,y,color = val)) + | |
geom_point(size=point_size, alpha = alpha) | |
if(!is.null(color.palette.vector)){ | |
p <- p + | |
scale_color_gradientn(name=val.orig, colors = color.palette.vector) | |
}else{ | |
p <- p + | |
scale_color_continuous(name = val.orig, low="gray", high="red") | |
} | |
}else{ | |
p <- dat %>% | |
ggplot(aes(x,y,color = val)) + | |
geom_point(size=point_size) | |
# p <- p + | |
# scale_color_discrete(name=val.orig) + | |
# guides(color = guide_legend(override.aes = aes(size = 2))) | |
# | |
if(!is.null(color.palette.vector)){ | |
p <- p + scale_color_manual(name=val.orig, | |
values = color.palette.vector, na.value="#808080") + | |
guides(color = guide_legend(override.aes = aes(size = 2))) | |
}else{ | |
p <- p + | |
scale_color_discrete(name=val.orig) + | |
guides(color = guide_legend(override.aes = aes(size = 2))) | |
} | |
} | |
if(with_repel){ | |
p <- p + ggrepel::geom_label_repel( | |
data = function(x){ | |
x %>% group_by(val) %>% | |
summarise(x = median(x), | |
y = median(y)) | |
}, | |
mapping = aes(label = val), size = repel_size | |
) + | |
theme(legend.position = "none") | |
} | |
return(p) | |
} | |
sce.run.liger.full = function(sce, min.cells = 50, k=25, split.by = "batch", n.cores=10){ | |
if(!is.na(ncol(rowData(sce)))) row.data = rowData(sce) | |
cat("Splitting\n") | |
data.list = sce.split.by.col(sce, split.by, min.cells = min.cells, n.cores = n.cores) | |
cat("Running liger\n") | |
data.liger = sce.run.liger(data.list,k=k) | |
cat("Combining to SCE object\n") | |
out = liger.to.sce(data.liger, data.list) | |
tmp.rd = matrix(NA, ncol=k, nrow=nrow(out), dimnames = list(rownames(out), paste0("liger_", 1:k))) | |
tmp.rd[colnames(data.liger@W), ] = t(data.liger@W) | |
tmp.rd = as.data.frame(tmp.rd) | |
if(!is.na(ncol(rowData(sce)))){ | |
rowData(out) <- tmp.rd %>% | |
rownames_to_column("gene_id") %>% | |
left_join( | |
as.data.frame(row.data) %>% | |
rownames_to_column("gene_id") | |
) %>% | |
as.data.frame() %>% | |
column_to_rownames("gene_id") | |
}else{ | |
rowData(out) = tmp.rd | |
} | |
cat("UMAP") | |
out = sce.run.umap(out, n_components = 2) | |
return(out) | |
} | |
sce.liger.pheatmap = function(sce, order.col.by = "dpt", annot.cols = NULL){ | |
annot = NULL | |
if(!is.null(annot.cols)){ | |
annot = as.data.frame(colData(sce)[,annot.cols]) | |
rownames(annot) <- colnames(sce) | |
colnames(annot) <- annot.cols | |
} | |
print(head(annot)) | |
ph.data = t(reducedDim(sce, "liger")[order(colData(sce)[,order.col.by]), ]) | |
rownames(ph.data) = paste0("liger_", 1:nrow(ph.data)) | |
print(head(ph.data)[,1:5]) | |
pheatmap::pheatmap( | |
ph.data, | |
show_rownames=T, | |
show_colnames=F, | |
cluster_cols=F, | |
clustering_distance_rows="correlation", | |
color=colorRampPalette(c("white", "black"))(100), | |
annotation_col = annot | |
) | |
} | |
sce.subsample.per.group <- function(sce, group.by, n.cells=1000, min.cells = 25, allow.min = F, | |
mc.ncores = 10){ | |
sce.list <- sce.split.by.col(sce, column = group.by, min.cells = min.cells, n.cores = mc.ncores) | |
if(!allow.min) sce.list <- sce.list[lapply(sce.list, ncol) %>% do.call("c", .) >= n.cells] | |
sce.list <- lapply(sce.list, function(x){ | |
if(ncol(x) > n.cells){ | |
set.seed(1234) | |
x <- x[,sample(colnames(x), n.cells, replace = F)] | |
} | |
return(x) | |
}) | |
out <- do.call(cbind, sce.list) | |
return(out) | |
} | |
sce.gene.enrichment <- function(sce, | |
in.group, | |
label, | |
balance.group = NULL, | |
do.balance = F, | |
subsample.to = 500, | |
allow.min.cells = F, | |
min.cells = 25, | |
mc.cores=10, | |
use.hvg = T, | |
frac.min=.25){ | |
## Subsampling, if wanted | |
if(do.balance){ | |
sce <- sce.subsample.per.group(sce, | |
group.by = in.group, | |
n.cells = subsample.to, | |
min.cells = min.cells, | |
allow.min = allow.min.cells, | |
mc.ncores = mc.cores) | |
} | |
## | |
umi <- assay(sce, "umi") | |
sf <- colSums(umi) | |
if(use.hvg) umi <- umi[get.info.genes(umi), ] | |
group.labels <- colData(sce)[,in.group] | |
are.ingroup <- group.labels %in% label | |
test.genes <- rowSums( umi[,are.ingroup, drop=F] > 0 ) > floor(frac.min * sum(are.ingroup)) | |
umi <- umi[test.genes, ] | |
out <- pbmcapply::pbmclapply(rownames(umi), function(x){ | |
tibble( | |
gene_id = x, | |
log2fc = log2(mean( log1p(umi[x,are.ingroup] / sf[are.ingroup])) / mean(log1p(umi[x,!are.ingroup] / sf[!are.ingroup]))), | |
p.value = wilcox.test(x = log1p(umi[x,are.ingroup] / sf[are.ingroup]), y = log1p(umi[x,!are.ingroup] / sf[!are.ingroup]), alternative = "greater" )$p.value | |
) | |
}, mc.cores = mc.cores) %>% | |
do.call(rbind, .) | |
out$p.adjust = p.adjust(out$p.value, method = "BH") | |
if(!is.null(ncol(rowData(sce)))){ | |
if("gene_name" %in% colnames(rowData(sce))){ | |
out <- out %>% left_join( | |
as.data.frame(rowData(sce)) %>% | |
rownames_to_column("gene_id") %>% | |
select("gene_id", "gene_name") | |
) | |
} | |
} | |
return(out %>% arrange(desc(log2fc))) | |
} | |
sce.all.markers <- function(sce, | |
in.group, | |
balance.group = NULL, | |
do.balance = F, | |
subsample.to = 500, | |
allow.min.cells = F, | |
min.cells = 25, | |
mc.cores=10, | |
use.hvg = T, | |
frac.min=.25){ | |
lvls <- unique(colData(sce)[,in.group]) | |
out <- lapply(lvls, function(i){ | |
cat("Finding genes for ", i, "\n") | |
sce.gene.enrichment(sce = sce, | |
in.group = in.group, | |
label = i, | |
balance.group = balance.group, | |
do.balance = do.balance, | |
subsample.to = subsample.to, | |
allow.min.cells = allow.min.cells, | |
min.cells = min.cells, | |
mc.cores=mc.cores, | |
use.hvg = use.hvg, | |
frac.min=frac.min) | |
}) | |
names(out) <- lvls | |
return(out) | |
} | |
sce.plot.along <- function(sce, val, along = "stage.ord", as.boxplot = F, as.violin = F, | |
pointsize = .1, color=NULL){ | |
val.orig <- val | |
has.rowdata <- ifelse(!is.null(ncol(rowData(sce))), | |
ifelse("gene_name" %in% colnames(rowData(sce)), | |
ifelse(val %in% rowData(sce)[,"gene_name"], | |
T, | |
F), | |
F), | |
F) | |
if(length(val) == ncol(sce)){ | |
value = val | |
}else if(val %in% rownames(sce)){ | |
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
}else if( has.rowdata ){ | |
val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)] | |
print(val) | |
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi"))) | |
}else{ | |
stop(paste0("Could not find \"", val, "\" in the object!")) | |
} | |
if(!is.null(color)){ | |
df <- tibble( | |
cell_id = colnames(sce), | |
exprs = value | |
) %>% | |
left_join( | |
as.data.frame(colData(sce)) %>% | |
rownames_to_column("cell_id") | |
) %>% | |
select(exprs, along, color) %>% | |
dplyr::rename(x = along, | |
group = color) | |
}else{ | |
df <- tibble( | |
cell_id = colnames(sce), | |
exprs = value | |
) %>% | |
left_join( | |
as.data.frame(colData(sce)) %>% | |
rownames_to_column("cell_id") | |
) %>% | |
select(exprs, along, color) %>% | |
dplyr::rename(x = along) | |
} | |
plt <- df %>% | |
ggplot(aes(x=x, y=exprs)) + | |
xlab(along) | |
if(as.boxplot){ | |
plt <- plt + geom_boxplot() | |
}else if(as.violin){ | |
plt <- plt + geom_violin() | |
}else if(!is.null(color)){ | |
plt <- plt + geom_point(aes(color=group), | |
size=pointsize) | |
}else{ | |
plt <- plt + geom_point(size=pointsize) | |
} | |
if(!is.numeric(colData(sce)[,along])){ | |
plt <- plt+ | |
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5)) | |
} | |
return(plt) | |
} | |
sce.get.gene.exprs <- function(sce, gene, umi=F){ | |
out <- NULL | |
gene.orig <- gene | |
if(!is.null(ncol(rowData(sce)))){ | |
if("gene_name" %in% colnames(rowData(sce))){ | |
if(gene %in% rowData(sce)[,"gene_name"]){ | |
gene <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)] | |
} | |
} | |
} | |
if(gene %in% rownames(sce)){ | |
if(!umi){ | |
out <- sqrt(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi"))) | |
#out <- scale(sqrt(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi"))))[,1] | |
}else{ | |
out <- assay(sce, "umi")[gene,] | |
} | |
} | |
return(out) | |
} | |
sce.get.exprs.df <- function(sce, genes){ | |
lapply(genes, function(g){ | |
exprs <- sce.get.gene.exprs(sce, g, umi = T) | |
if(is.null(exprs)){ | |
return(NULL) | |
}else{ | |
sce.colData(sce) %>% | |
as_tibble() %>% | |
left_join( | |
tibble( | |
cell_id = colnames(sce), | |
size_factor = colSums(assay(sce, "umi")) | |
) | |
) %>% | |
left_join( | |
tibble( | |
cell_id = names(exprs), | |
umi = exprs, | |
gene = g | |
) | |
) | |
} | |
}) %>% do.call(rbind, .) | |
} | |
# Plotting expression polished: | |
sce.plot.polished.exprs <- function(sce, | |
gn, | |
reducedDim_name = "umap2d", | |
color = "cell_type", | |
dims = 1:2, | |
dim_names = "UMAP", | |
color_vec = c("gray", rev(viridis::inferno(100)))){ | |
ex = sce.get.exprs.df(sce, gn) %>% | |
mutate(exprs = log1p(umi / size_factor * 1e6)) | |
rd <- reducedDim(sce, reducedDim_name)[,dims] %>% | |
as.data.frame() %>% | |
rownames_to_column("cell_id") %>% | |
as_tibble() | |
colnames(rd) <- c("cell_id", "d1", "d2") | |
colnames(ex)[colnames(ex) == color] <- "group" | |
ex %>% | |
left_join(rd, by="cell_id") %>% | |
ungroup() %>% | |
sample_n(n()) %>% | |
filter(group != "NA") %>% | |
ggplot(aes(d1, d2)) + | |
geom_point(aes(color=exprs),size=.01, alpha=.5) + | |
#scale_color_viridis_c("expression", option="A") + | |
scale_color_gradientn(colors = color_vec) + | |
#new_scale_color() + | |
ggrepel::geom_label_repel( | |
data = function(x){ | |
x %>% | |
group_by(group) %>% | |
mutate(prop = sum(umi>0) / n()) %>% | |
filter(umi > 0) %>% | |
#filter(n() >= 10) %>% | |
summarise(d1 = median(d1), | |
d2 = median(d2), | |
prop = unique(prop)) %>% | |
ungroup() %>% | |
top_n(3, prop) %>% | |
filter(prop >= .01) %>% | |
mutate(prop = round(prop, 2)) | |
}, | |
mapping=aes(size = prop, | |
label = group), | |
#size=3, | |
force = 5, | |
nudge_x = 2, | |
nudge_y = 2 | |
) + | |
labs(x = paste0(dim_names, dims[1]), | |
y = paste0(dim_names, dims[2])) + | |
scale_size(range = c(4, 7)) + | |
guides(size = guide_legend("proportion")) + | |
ggtitle(gn) + | |
#theme_dark() + | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.background = element_rect(fill="white"), #gainsboro | |
axis.text = element_blank(), | |
axis.ticks = element_blank(), | |
axis.title = element_text(size=15), | |
plot.title = element_text(size=18), | |
legend.title = element_text(size=15), | |
legend.text = element_text(size=13)) + | |
coord_fixed() | |
} | |
# Depreciated | |
sce.dotplot <- function(sce, genes, group.by = "cell_type"){ | |
tmp <- lapply(genes, function(g){ | |
exprs <- sce.get.gene.exprs(sce, g, umi = T) | |
if(is.null(exprs)){ | |
return(NULL) | |
}else{ | |
colData(sce) %>% | |
as_tibble() %>% | |
add_column(umi = exprs) %>% | |
mutate(exprs = umi / colSums(assay(sce, "umi"))) %>% | |
mutate(expresses = ifelse(umi > 0 , 1, 0)) %>% | |
dplyr::rename(gr = group.by) %>% | |
group_by(gr) %>% | |
summarise(exprs = mean(exprs), | |
expresses = sum(expresses) / n()) %>% | |
add_column(gene = g) %>% | |
mutate(exprs = ((exprs - min(exprs)) / max(exprs - min(exprs)))) | |
} | |
}) %>% do.call(rbind, .) | |
if(is.null(tmp)) stop("No genes found!\n") | |
tmp = tmp %>% | |
filter(!is.na(gene)) | |
tmp %>% | |
ggplot(aes(x=gene, y=gr)) + | |
geom_point(aes(color=exprs, size=expresses)) + | |
theme(axis.text.x = element_text(angle=90, hjust=1,vjust=.5)) + | |
scale_color_continuous(name = "scaled mean expr.", low="gray", high="red") + | |
scale_size_continuous(name="fraction pos.") + | |
ylab(group.by) + | |
scale_radius(limits = c(0,NA), range=c(0,5)) + | |
scale_x_discrete(limits = genes[genes %in% tmp$gene], | |
breaks = genes[genes %in% tmp$gene]) | |
} | |
sce.plot.dotplot = function(sce, genes, group.by = "cell_type"){ | |
stopifnot(group.by %in% colnames(colData(sce))) | |
genes = unique(genes) | |
use.genes = genes[genes %in% rownames(sce)] | |
stopifnot(length(use.genes) > 0) | |
cd = sce.colData(sce) | |
umi = assay(sce, "umi") | |
exprs = t(t(umi[use.genes, ]) / colSums(umi)) | |
umi = umi[use.genes, ] | |
umi.df = reshape2::melt(as.matrix(umi)) %>% | |
as_tibble() %>% | |
dplyr::rename(gene = Var1, | |
cell_id = Var2, | |
umi = value) %>% | |
mutate_if(is.factor, as.character) | |
ex.df = reshape2::melt(as.matrix(exprs)) %>% | |
as_tibble() %>% | |
dplyr::rename(gene = Var1, | |
cell_id = Var2, | |
exprs = value) %>% | |
mutate_if(is.factor, as.character) | |
ex.df %>% | |
left_join(umi.df, by=c("cell_id", "gene")) %>% | |
left_join(cd, by="cell_id") %>% | |
dplyr::rename(gr = group.by) %>% | |
group_by(gr, gene) %>% | |
summarise(mexprs = mean(exprs), | |
frac.exprs = sum(umi>0) / n()) %>% | |
group_by(gene) %>% | |
mutate(mexprs = mexprs - min(mexprs)) %>% | |
mutate(mexprs = mexprs / max(mexprs)) %>% | |
ggplot(aes(gr, gene, size=frac.exprs, color = mexprs)) + | |
ylim(use.genes) + | |
geom_point() + | |
scale_radius(limits = c(0, NA), range = c(0,5), name="fraction exprs.") + | |
#scale_color_continuous(name = "scaled exprs.", low="gray", high = "red") + | |
scale_color_gradientn(name = "scaled exprs.", colors = c("gray", rev(viridis::inferno(100)))) + | |
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + | |
labs(y = "", x = "") | |
} | |
sce.geneid.to.genename <- function(sce, genes){ | |
stopifnot("gene_name" %in% colnames(rowData(sce))) | |
out <- lapply(genes, function(gene){ | |
rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)] | |
}) %>% do.call("c",.) | |
return(out) | |
} | |
sce.heatmap <- function(sce, genes, order.by = "cell_type", sort.by = "stage.ord", mc.cores = 10){ | |
stopifnot(length(genes) > 1) | |
sce.list <- sce.split.by.col(sce, order.by,n.cores = mc.cores) | |
sce.list <- sce.list[order(names(sce.list))] | |
gene.ids <- sce.geneid.to.genename(sce, genes) | |
print(gene.ids) | |
print("getting exprs") | |
ex.list <- pbmcapply::pbmclapply(names(sce.list), function(i){ | |
x <- sce.list[[i]] | |
pb <- sce.pseudobulk(x, sort.by, mc.cores = 1, min_cells = 2) | |
pb <- pb[,sort(colnames(pb))] | |
pb <- t(t(pb[gene.ids,]) / colSums(pb)) | |
colnames(pb) <- paste0(i, "//", colnames(pb)) | |
return(pb) | |
}, mc.cores = mc.cores) | |
print("combining") | |
ex <- do.call(cbind, ex.list) | |
rownames(ex) <- genes | |
ex.annot <- data.frame( | |
cell.type = str_split_fixed(colnames(ex), "//", 2)[,1], | |
stage = str_split_fixed(colnames(ex), "//", 2)[,2], | |
row.names = colnames(ex) | |
) | |
pheatmap::pheatmap(ex, | |
cluster_cols = F, | |
show_colnames =F, | |
annotation_col = ex.annot, | |
clustering_distance_rows = "correlation", | |
scale = "row") | |
} | |
sce.make.rowData <- function(sce, meta.data){ | |
colnames(meta.data) <- c("gene_id", "gene_name", "ortho_id", "ortho_name", "ortho_type") | |
rw <- tibble( | |
gene_id = rownames(sce) | |
) %>% | |
left_join( | |
meta.data %>% | |
group_by(gene_id, gene_name, ortho_type) %>% | |
summarise(ortho_id = paste0(ortho_id, collapse=","), | |
ortho_name = paste0(ortho_id, collapse=",")) | |
) | |
if(!is.null(ncol(rowData(sce)))){ | |
rowData(sce) <- rw %>% | |
left_join( | |
as.data.frame(rowData(sce)) %>% | |
rownames_to_column("gene_id") | |
) %>% | |
as.data.frame() %>% | |
column_to_rownames("gene_id") | |
}else{ | |
rowData(sce) <- rw %>% as.data.frame() %>% column_to_rownames("gene_id") | |
} | |
return(sce) | |
} | |
sce.colData <- function(sce){ | |
out <- colData(sce) %>% | |
as.data.frame() %>% | |
rownames_to_column("cell_id") %>% | |
as_tibble() | |
colnames(out) <- c("cell_id", colnames(colData(sce))) | |
return(out) | |
} | |
sce.proportion.plot <- function(sce, group.by = "cell_type", along = "stage.ord"){ | |
require(tidyverse) | |
require(SingleCellExperiment) | |
stopifnot(group.by %in% colnames(colData(sce))) | |
stopifnot(along %in% colnames(colData(sce))) | |
if(is.null(cbar.name)){ | |
cbar.name <- fill | |
} | |
sce.colData(sce) %>% | |
dplyr::rename(gr = group.by)%>% | |
dplyr::rename(al = along) %>% | |
group_by(gr, al) %>% | |
summarise(n = n()) %>% | |
group_by(al) %>% | |
mutate(prop = n/sum(n)) %>% | |
ggplot(aes(x = al, y = prop, fill = gr)) + | |
geom_bar(stat="identity") + | |
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + | |
scale_fill_discrete(name=cbar.name) | |
} | |
sce.proportion.plot.multi <- function(..., fill = "cell_type", along="HUM corr. stage", feat = "species", ncol=2, cbar.name = NULL, drop=F, scales="fixed"){ | |
require(tidyverse) | |
require(SingleCellExperiment) | |
sce.list <- list(...) | |
if(is.null(cbar.name)){ | |
cbar.name <- fill | |
} | |
for(i in sce.list){ | |
stopifnot(fill %in% colnames(colData(i))) | |
stopifnot(along %in% colnames(colData(i))) | |
} | |
df <- do.call(rbind, lapply(sce.list, function(x){ | |
tmp <- sce.colData(x)[,c(fill, along, feat)] | |
return(tmp) | |
})) | |
df %>% | |
dplyr::rename(fl = fill) %>% | |
dplyr::rename(al = along) %>% | |
dplyr::rename(sp = feat) %>% | |
group_by(fl, al, sp) %>% | |
summarise(n = n()) %>% | |
group_by(al, sp) %>% | |
mutate(prop = n/sum(n)) %>% | |
ggplot(aes(x = al, y = prop, fill = fl)) + | |
geom_bar(stat="identity") + | |
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + | |
facet_wrap(vars(sp), ncol=ncol, drop = drop, scales=scales) + | |
scale_fill_discrete(name=cbar.name) | |
} | |
sce.propline <- function(..., facet = "cell_type", group="TissueID", along="HUM corr. stage", feat = "species", ncol=2, cbar.name = NULL, drop=F, scales="fixed", min.cells = 10){ | |
require(tidyverse) | |
require(SingleCellExperiment) | |
sce.list <- list(...) | |
if(is.null(cbar.name)){ | |
cbar.name <- facet | |
} | |
for(i in sce.list){ | |
stopifnot(facet %in% colnames(colData(i))) | |
stopifnot(along %in% colnames(colData(i))) | |
} | |
df <- do.call(rbind, lapply(sce.list, function(x){ | |
tmp <- sce.colData(x)[,c(facet, along, feat, group)] | |
return(tmp) | |
})) | |
df %>% | |
dplyr::rename(fl = facet) %>% | |
dplyr::rename(al = along) %>% | |
dplyr::rename(sp = feat) %>% | |
dplyr::rename(gr = group) %>% | |
group_by(fl, al, sp, gr) %>% | |
summarise(n = n()) %>% | |
filter(n >= min.cells) %>% | |
group_by(al, sp, gr) %>% | |
mutate(prop = n/sum(n)) %>% | |
ggplot(aes(x = al, y = prop, color = sp)) + | |
geom_point() + | |
stat_summary(aes(group=sp), geom = "line", fun.y = "median") + | |
stat_summary(aes(group=sp), geom = "linerange", fun.ymax = max, fun.ymin = min) + | |
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + | |
facet_wrap(vars(fl), ncol=ncol, drop = drop, scales=scales) + | |
scale_fill_discrete(name=cbar.name) | |
} | |
sce.pca <- function(sce, n=50){ | |
umi <- assay(sce, "umi") | |
cat("HVG identification\n") | |
hvg <- get.info.genes(umi) | |
cat("Normalizing\n") | |
nrm <- t(sqrt(t(umi[hvg,]) / colSums(umi))) | |
cat("Running PCA\n") | |
set.seed(1234) | |
pca.results <- irlba::prcomp_irlba(t(nrm), n=n) | |
feature.loadings <- pca.results$rotation | |
rownames(feature.loadings) <- hvg | |
cell.embeddings <- pca.results$x | |
rownames(cell.embeddings) <- colnames(umi) | |
colnames(feature.loadings) <- colnames(cell.embeddings) <- paste0("PC", sprintf("%02d", 1:ncol(cell.embeddings))) | |
sce@metadata$hvg <- hvg | |
sce@metadata$feature.loadings <- feature.loadings | |
reducedDim(sce, "pca") <- cell.embeddings | |
return(sce) | |
} | |
sce.smoothed.exprs <- function(sce, g){ | |
stopifnot("pca" %in% names(sce@reducedDims)) | |
# stopifnot(g %in% rownames(sce)) | |
if(length(g) == 0) { | |
return( | |
sce.colData(sce) %>% | |
add_column(smoothed = 0) | |
) | |
} | |
if(g %in% rownames(sce)){ | |
cell.embeddings <- reducedDim(sce, "pca") | |
umis <- assay(sce, "umi")[g,] | |
sf <- colSums(assay(sce,"umi")) | |
a <- (cell.embeddings %*% t( matrix(log1p(umis / sf), nrow = 1) %*% cell.embeddings))[,1] | |
a[a<= 0] <- 0 | |
return( | |
sce.colData(sce) %>% | |
add_column(smoothed = a) | |
) | |
}else{ | |
return( | |
sce.colData(sce) %>% | |
add_column(smoothed = 0) | |
) | |
} | |
} | |
sce.smoothed.exprs.mat <- function(sce, g){ | |
stopifnot("pca" %in% names(sce@reducedDims)) | |
stopifnot(sum(rownames(sce) %in% g) == length(g)) | |
# if(g %in% rownames(sce)){ | |
cell.embeddings <- reducedDim(sce, "pca") | |
umis <- assay(sce, "umi")[g,,drop=F] | |
sf <- colSums(assay(sce,"umi")) | |
a <- (cell.embeddings %*% t( log1p(umis[g,] / sf) %*% cell.embeddings)) | |
a[a <= 0] <- 0 | |
return( | |
a | |
) | |
} | |
sce.run.harmony = function(sce, vars_use = c("batch"), use.redDim = "pca", | |
reference_values = NULL){ | |
stopifnot(use.redDim %in% reducedDimNames(sce)) | |
library(harmony) | |
meta = sce.colData(sce) %>% | |
as.data.frame() %>% | |
column_to_rownames("cell_id") | |
l = reducedDim(sce, use.redDim) | |
set.seed(1234) | |
hrm = HarmonyMatrix(data_mat = l, | |
meta_data = meta, | |
do_pca = FALSE, | |
verbose = T, | |
vars_use=vars_use, | |
reference_values = reference_values) | |
reducedDim(sce, "harmony") = hrm | |
return(sce) | |
} | |
get.nn <- function(x, y=x, k=25, metric="cosine", return.self = F){ | |
if(!return.self){ | |
if(sum(dim(x) == dim(y)) == 2){ | |
if(sum(x == y) == length(x)){ | |
k <- k+1 | |
} | |
} | |
} | |
pd <- reticulate::import("pynndescent") | |
cat("Building index\n") | |
idx <- pd$NNDescent(x, metric = metric) | |
cat("Finding nearest neighbors\n") | |
nn <- idx$query(y, k=as.integer(k)) | |
names(nn) <- c("nn.idx", "nn.dist") | |
nn$nn.idx <- nn$nn.idx + 1 | |
if(!return.self){ | |
nn <- lapply(nn, function(x) x[,2:ncol(x)]) | |
} | |
return(nn) | |
} | |
#' Gets top N markers for each cluster | |
#' | |
#' Uses tf-idf ordering to get the top N markers of each cluster. For each cluster, either the top N or all genes passing the hypergeometric test with the FDR specified, whichever list is smallest. | |
#' | |
#' Term Frequency - Inverse Document Frequency is used in natural language processing to identify terms specific to documents. This function uses the same idea to order genes within a group by how predictive of that group they are. The main advantage of this is that it is extremely fast and gives reasonable results. | |
#' | |
#' To do this, gene expression is binarised in each cell so each cell is either considered to express or not each gene. That is, we replace the counts with \code{toc > zeroCut}. The frequency with which a gene is expressed within the target group is compared to the global frequency to calculate the tf-idf score. We also calculate a multiple hypothesis corrected p-value based on a hypergeometric test, but this is extremely permissive. | |
#' | |
#' @export | |
#' @param toc Table of counts. Must be a sparse matrix. | |
#' @param clusters Vector of length \code{ncol(toc)} giving cluster membership. | |
#' @param N Number of marker genes to return per cluster. | |
#' @param FDR False discover rate to use. | |
#' @param expressCut Value above which a gene is considered expressed. | |
#' @return data.frame with top N markers (or all that pass the hypergeometric test) and their statistics for each cluster. | |
#' @examples | |
#' #Calculate markers of clusters in toy data | |
#' mrks = quickMarkers(scToy$toc,scToy$metaData$clusters) | |
#' \dontrun{ | |
#' #Calculate markers from Seurat (v3) object | |
#' mrks = quickMarkers(srat@assays$RNA@count,[email protected]) | |
#' } | |
quickMarkers = function(toc,clusters,N=10,FDR=0.01,expressCut=0.9){ | |
#Convert to the more manipulable format | |
toc = as(toc,'dgTMatrix') | |
w = which(toc@x>expressCut) | |
#Get the counts in each cluster | |
clCnts = table(clusters) | |
nObs = split(factor(rownames(toc))[toc@i[w]+1],clusters[toc@j[w]+1]) | |
nObs = sapply(nObs,table) | |
#Calculate the observed and total frequency | |
nTot = rowSums(nObs) | |
tf = t(t(nObs)/as.integer(clCnts[colnames(nObs)])) | |
ntf = t(t(nTot - nObs)/as.integer(ncol(toc)-clCnts[colnames(nObs)])) | |
idf = log(ncol(toc)/nTot) | |
score = tf*idf | |
#Calculate p-values | |
qvals = lapply(seq_len(ncol(nObs)),function(e) | |
p.adjust(phyper(nObs[,e]-1,nTot,ncol(toc)-nTot,clCnts[colnames(nObs)[e]],lower.tail=FALSE),method='BH')) | |
qvals = do.call(cbind,qvals) | |
colnames(qvals) = colnames(nObs) | |
#Get gene frequency of second best | |
sndBest = lapply(seq_len(ncol(tf)),function(e) apply(tf[,-e,drop=FALSE],1,max)) | |
sndBest = do.call(cbind,sndBest) | |
colnames(sndBest) = colnames(tf) | |
#And the name | |
sndBestName = lapply(seq_len(ncol(tf)),function(e) (colnames(tf)[-e])[apply(tf[,-e,drop=FALSE],1,which.max)]) | |
sndBestName = do.call(cbind,sndBestName) | |
colnames(sndBestName) = colnames(tf) | |
rownames(sndBestName) = rownames(tf) | |
#Now get the top N for each group | |
w = lapply(seq_len(ncol(nObs)),function(e){ | |
o = order(score[,e],decreasing=TRUE) | |
if(sum(qvals[,e]<FDR)>=N){ | |
o[seq(N)] | |
}else{ | |
o[qvals[o,e]<FDR] | |
} | |
}) | |
#Now construct the data.frame with everything | |
ww = cbind(unlist(w,use.names=FALSE),rep(seq_len(ncol(nObs)),lengths(w))) | |
out = data.frame(gene = rownames(nObs)[ww[,1]], | |
cluster = colnames(nObs)[ww[,2]], | |
geneFrequency = tf[ww], | |
geneFrequencyOutsideCluster = ntf[ww], | |
geneFrequencySecondBest = sndBest[ww], | |
geneFrequencyGlobal = nTot[ww[,1]]/ncol(toc), | |
secondBestClusterName = sndBestName[ww], | |
tfidf = score[ww], | |
idf = idf[ww[,1]], | |
qval = qvals[ww], | |
stringsAsFactors=FALSE) | |
return(out) | |
} | |
sce.tfidf = function(sce, group_col = "cell_type", N = 10, FDR = .01, expressCut = .9){ | |
stopifnot(group_col %in% colnames(colData(sce))) | |
X = assay(sce, "umi") | |
gr = colData(sce)[,group_col] | |
quickMarkers(X, gr, N = N, FDR = FDR, expressCut = expressCut) | |
} | |
# Approximation of expression by SVD | |
CreateSceApprox = setClass("sceApprox", | |
slots = list( | |
D = "matrix", | |
U = "matrix", | |
V = "matrix", | |
colData = "tbl" | |
)) | |
sce.approx.expression = function(sce, n = 100, genes = rownames(sce)){ | |
X = assay(sce, "umi")[genes, ] | |
X = t(X) / colSums(X) | |
X = sqrt(X) | |
x.svd = irlba::irlba(X, nv = n) | |
D = diag(x.svd$d) | |
U = x.svd$u | |
V = x.svd$v | |
rownames(U) = rownames(X) | |
rownames(V) = colnames(X) | |
CreateSceApprox(D = D, U = U, V = V, colData = sce.colData(sce)) | |
} | |
setMethod("show", | |
"sceApprox", | |
function(object){ | |
cat("nCells:", nrow(object@U), "\n") | |
cat("nGenes:", nrow(object@V), "\n") | |
cat("nDims:", ncol(object@U), "\n") | |
cat("\n") | |
cat("colData colnames:\n" ) | |
cat(colnames(object@colData)) | |
}) | |
setGeneric("approximate", | |
def = function(object, gene){ | |
standardGeneric("approximate") | |
}) | |
#` Approximation of expression using SVD on SingleCellExperiment object | |
#` | |
#` @param object SingleCellExperiment object to approximate expression to | |
#` @param gene Gene or genes as character or character vector. | |
#` Needs to be present as rownames in the SCE object | |
setMethod("approximate", | |
"sceApprox", | |
function(object, gene){ | |
stopifnot(all(gene %in% rownames(object@V))) | |
appr = t(object@D %*% t(object@U)) %*% t(object@V[gene, , drop=F]) | |
appr[appr < 0] = 0 | |
return(appr) | |
}) | |
setGeneric("approximate_tbl", | |
def = function(object, gene){ | |
standardGeneric("approximate_tbl") | |
}) | |
setMethod("approximate_tbl", | |
"sceApprox", | |
function(object, gene){ | |
appr = approximate(object, gene) | |
reshape2::melt(appr) %>% | |
dplyr::rename(cell_id = Var1, | |
gene = Var2) %>% | |
mutate(cell_id = as.character(cell_id)) %>% | |
#rownames_to_column("cell_id") %>% | |
left_join(object@colData, by="cell_id") | |
}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment