Skip to content

Instantly share code, notes, and snippets.

@klprint
Created March 6, 2020 13:20
Show Gist options
  • Save klprint/a0776d7f92f8d29cc233606467eaafb5 to your computer and use it in GitHub Desktop.
Save klprint/a0776d7f92f8d29cc233606467eaafb5 to your computer and use it in GitHub Desktop.
library(hdf5r)
library(Matrix)
source("https://gist.githubusercontent.com/klprint/1ab4468eb3c54abcf0422dec6223b8fc/raw/b4cc33f5b4da25bcc2e678cf46b692fe67605460/single_cell_functions.R")
library(SingleCellExperiment)
library(tidyverse)
sce.raw.pca <- function(sce, k = 100){
umi <- assay(sce, "umi")
cat("Getting informative genes\n")
info.genes <- get.info.genes(umi)
cat("\tSetting info genes\n")
umi <- umi[info.genes, ]
cat("Normalization\n")
nrm <- t(umi) / colSums(umi)
cat("Running PCA\n")
pca <- irlba::prcomp_irlba(nrm, n = k)$x
rownames(pca) <- colnames(umi)
reducedDim(sce, "raw_pca") <- pca
return(sce)
}
sce.split.by.col <- function(sco, column, min.cells = 1, n.cores = 10){
lvls <- unique(colData(sco)[,column])
sco <- sco[,!is.na(colData(sco)[,column])]
out <- pbmcapply::pbmclapply(lvls, function(x){
sel <- colData(sco)[,column] == x
if(sum(sel, na.rm = T) <= min.cells) return(NA)
sco[,sel]
}, mc.cores = n.cores)
names(out) <- lvls
out <- out[!is.na(out)]
return(out)
}
sce.run.liger <- function(sce.list, k = 25, assay.slot = "umi", plot.genes=F){
require(liger)
umi <- lapply(sce.list, function(x) assay(x, assay.slot))
cat("Creating LIGER object\n")
l = createLiger(umi)
cat("Preprocessing LIGER\n")
l = liger::normalize(l)
l = liger::selectGenes(l, do.plot = plot.genes)
l = liger::scaleNotCenter(l)
cat("Running LIGER\n")
l = optimizeALS(l, k = k)
return(l)
}
run.umap <- function(X, n_components = 3, min_dist = .1, n_neighbors = 15){
set.seed(1234)
uwot::umap(
X,
metric = "cosine",
min_dist = min_dist,
n_neighbors = n_neighbors,
verbose = T,
n_components = n_components
)
}
sce.run.umap <- function(sce, use.reduction = "liger", n_components = 3, min_dist = .1, n_neighbors = 15){
out <- run.umap(reducedDim(sce, use.reduction), n_components = n_components, min_dist = min_dist, n_neighbors = n_neighbors)
rownames(out) <- rownames(reducedDim(sce, use.reduction))
reducedDim(sce, paste0("umap", n_components, "d")) <- out
return(sce)
}
liger.to.sce <- function(liger.object, input.sce.list){
out <- SingleCellExperiment(
assays = list(umi = do.call(merge.sparse, [email protected])),
colData = lapply(input.sce.list, colData) %>% do.call(rbind,.),
reducedDims = list(
liger = do.call(rbind, liger.object@H)
)
)
return(out)
}
sce.pointcloud.cat <- function(sce, reduceDimName, categoryName){
emb <- reducedDim(sce, reduceDimName)
ctg <- colData(sce)[,categoryName]
ctg[is.na(ctg)] <- "ND"
emb <- emb[order(ctg), ]
ctg <- sort(ctg)
babyplots::pointCloud(emb, "categories", ctg)
}
sce.pointcloud.gene <- function(sce, reduceDimName, gene, assay="umi"){
emb <- reducedDim(sce, reduceDimName)
val <- log1p((assay(ct.comb["ENSG00000075891", ], assay) / colSums( assay(ct.comb, assay)))[1,])
babyplots::pointCloud(emb, "values", val)
}
sce.pointcloud.val <- function(sce, reduceDimName, val){
emb <- reducedDim(sce, reduceDimName)
babyplots::pointCloud(emb, "values", val)
}
sce.pseudobulk <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10){
splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells)
#print(splitted)
if(length(splitted) == 0){
return(NA)
}
pb <- pbmcapply::pbmclapply(splitted, function(x) rowSums(assay(x, assay)), mc.cores = mc.cores) %>%
do.call(cbind, .)
colnames(pb) <- names(splitted)
return(pb)
}
sce.to.anndata <- function(sce, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
cat("Creating AnnData Object\n")
sc <- import("scanpy")
ad <- sc$AnnData(X = t(assay(sce, assay)),
obs = as.data.frame(colData(sce)),
obsm = reducedDims(sce)@listData[!startsWith(reducedDimNames(sce), "X_")])
print(ad)
sc$pp$neighbors(ad, n_neighbors = as.integer(n_neighbors), use_rep = neighbors_from, n_pcs = as.integer(n_pcs))
return(ad)
}
sce.diffmap <- function(sce, n_diff_comps = 5, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
sc <- import("scanpy")
ad <- sce.to.anndata(sce,
n_neighbors = as.integer(n_neighbors),
neighbors_from = neighbors_from,
n_pcs = as.integer(n_pcs),
assay = assay)
cat("Running diffusion map\n")
sc$tl$diffmap(ad, n_comps = as.integer(n_diff_comps))
sce@metadata$ad <- ad$copy()
reducedDim(sce, "X_diffmap") <- ad$obsm["X_diffmap"]
return(sce)
}
sce.dpt <- function(sce, root_cell_index, n_dcs = 5){
stopifnot("X_diffmap" %in% reducedDimNames(sce))
sc <- import("scanpy")
ad <- sce@metadata$ad
ad$uns["iroot"] <- as.integer(root_cell_index-1)
cat("Running DPT\n")
sc$tl$dpt(ad, n_dcs = as.integer(n_dcs))
sce$dpt <- ad$obs["dpt_pseudotime"][,1]
sce$dpt_rank <- rank(sce$dpt, ties.method = "first")
sce$dpt_rank <- sce$dpt_rank - min(sce$dpt_rank)
sce$dpt_rank <- sce$dpt_rank / max(sce$dpt_rank)
return(sce)
}
sce.plot.3d <- function(sce, val, redDim = "umap3d", width = NULL, height=NULL){
coord <- reducedDim(sce, redDim)[,1:3]
if(val %in% colnames(colData(sce))){
value = colData(sce)[,val]
}else if(val %in% rownames(sce)){
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
}else{
stop(paste0("Could not find \"", val, "\" in the object!"))
}
val = value
if(is.numeric(val)){
val[is.na(val)] <- 0
babyplots::pointCloud(
coord,
"values",
val,
width = width,
height = height
)
}else if(is.factor(val) | is.character(val)){
val[is.na(val)] <- "NA"
coord <- coord[order(val), ]
val <- sort(val)
babyplots::pointCloud(
coord,
"categories",
val,
width = width,
height = height
)
}else{
stop("val is neither numeric, nor a character or factor!")
}
}
sce.plot.2d <- function(sce, val, redDim = "umap2d", comps = 1:2, point_size=.1, with_repel = F, repel_size=3, do.sort=F, force.discrete = F){
coord <- reducedDim(sce, redDim)[,comps]
val.orig <- val
has.rowdata <- ifelse(!is.null(ncol(rowData(sce))),
ifelse("gene_name" %in% colnames(rowData(sce)),
ifelse(val %in% rowData(sce)[,"gene_name"],
T,
F),
F),
F)
if(length(val) == ncol(sce)){
value = val
}else if(val %in% colnames(colData(sce))){
value = colData(sce)[,val]
}else if(val %in% rownames(sce)){
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
}else if( has.rowdata ){
val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
print(val)
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
}else{
stop(paste0("Could not find \"", val, "\" in the object!"))
}
if(force.discrete) value <- factor(as.character(value), levels = as.character(sort(unique(value))))
dat <- tibble(
cell_id = rownames(coord),
x = coord[,1],
y = coord[,2]
) %>%
left_join(
tibble(
cell_id = rownames(colData(sce)),
val = value
)
)
if(is.numeric(value)){
if(do.sort){
dat <- dat %>% arrange(val)
}
p <- dat %>%
ggplot(aes(x,y,color = val)) +
geom_point(size=point_size)
p <- p +
scale_color_continuous(name = val.orig, low="gray", high="red")
}else{
p <- dat %>%
ggplot(aes(x,y,color = val)) +
geom_point(size=point_size)
p <- p +
scale_color_discrete(name=val.orig) +
guides(color = guide_legend(override.aes = aes(size = 2)))
}
if(with_repel){
p <- p + ggrepel::geom_label_repel(
data = function(x){
x %>% group_by(val) %>%
summarise(x = median(x),
y = median(y))
},
mapping = aes(label = val), size = repel_size
) +
theme(legend.position = "none")
}
return(p)
}
sce.run.liger.full = function(sce, min.cells = 50, k=25, split.by = "batch"){
if(!is.na(ncol(rowData(sce)))) row.data = rowData(sce)
cat("Splitting\n")
data.list = sce.split.by.col(sce, split.by, min.cells = min.cells)
cat("Running liger\n")
data.liger = sce.run.liger(data.list,k=k)
cat("Combining to SCE object\n")
out = liger.to.sce(data.liger, data.list)
tmp.rd = matrix(NA, ncol=k, nrow=nrow(out), dimnames = list(rownames(out), paste0("liger_", 1:k)))
tmp.rd[colnames(data.liger@W), ] = t(data.liger@W)
tmp.rd = as.data.frame(tmp.rd)
if(!is.na(ncol(rowData(sce)))){
rowData(out) <- tmp.rd %>%
rownames_to_column("gene_id") %>%
left_join(
as.data.frame(row.data) %>%
rownames_to_column("gene_id")
) %>%
as.data.frame() %>%
column_to_rownames("gene_id")
}else{
rowData(out) = tmp.rd
}
cat("UMAP")
out = sce.run.umap(out, n_components = 2)
return(out)
}
sce.liger.pheatmap = function(sce, order.col.by = "dpt", annot.cols = NULL){
annot = NULL
if(!is.null(annot.cols)){
annot = as.data.frame(colData(sce)[,annot.cols])
rownames(annot) <- colnames(sce)
colnames(annot) <- annot.cols
}
print(head(annot))
ph.data = t(reducedDim(sce, "liger")[order(colData(sce)[,order.col.by]), ])
rownames(ph.data) = paste0("liger_", 1:nrow(ph.data))
print(head(ph.data)[,1:5])
pheatmap::pheatmap(
ph.data,
show_rownames=T,
show_colnames=F,
cluster_cols=F,
clustering_distance_rows="correlation",
color=colorRampPalette(c("white", "black"))(100),
annotation_col = annot
)
}
sce.subsample.per.group <- function(sce, group.by, n.cells=1000, min.cells = 25, allow.min = F,
mc.ncores = 10){
sce.list <- sce.split.by.col(sce, column = group.by, min.cells = min.cells, n.cores = mc.ncores)
if(!allow.min) sce.list <- sce.list[lapply(sce.list, ncol) %>% do.call("c", .) >= 1000]
sce.list <- lapply(sce.list, function(x){
if(ncol(x) > n.cells){
x <- x[,sample(colnames(x), n.cells, replace = F)]
}
return(x)
})
out <- do.call(cbind, sce.list)
return(out)
}
sce.gene.enrichment <- function(sce,
in.group,
label,
balance.group = NULL,
do.balance = F,
subsample.to = 500,
allow.min.cells = F,
min.cells = 25,
mc.cores=10,
use.hvg = T,
frac.min=.25){
## Subsampling, if wanted
if(do.balance){
sce <- sce.subsample.per.group(sce,
group.by = in.group,
n.cells = subsample.to,
min.cells = min.cells,
allow.min = allow.min.cells,
mc.ncores = mc.cores)
}
##
umi <- assay(sce, "umi")
sf <- colSums(umi)
if(use.hvg) umi <- umi[get.info.genes(umi), ]
group.labels <- colData(sce)[,in.group]
are.ingroup <- group.labels %in% label
test.genes <- rowSums( umi[,are.ingroup] > 0 ) > floor(frac.min * sum(are.ingroup))
umi <- umi[test.genes, ]
out <- pbmcapply::pbmclapply(rownames(umi), function(x){
tibble(
gene_id = x,
log2fc = log2(mean( log1p(umi[x,are.ingroup] / sf[are.ingroup])) / mean(log1p(umi[x,!are.ingroup] / sf[!are.ingroup]))),
p.value = wilcox.test(x = log1p(umi[x,are.ingroup] / sf[are.ingroup]), y = log1p(umi[x,!are.ingroup] / sf[!are.ingroup]), alternative = "greater" )$p.value
)
}, mc.cores = mc.cores) %>%
do.call(rbind, .)
out$p.adjust = p.adjust(out$p.value, method = "BH")
if(!is.null(ncol(rowData(sce)))){
if("gene_name" %in% colnames(rowData(sce))){
out <- out %>% left_join(
as.data.frame(rowData(sce)) %>%
rownames_to_column("gene_id") %>%
select("gene_id", "gene_name")
)
}
}
return(out %>% arrange(desc(log2fc)))
}
sce.all.markers <- function(sce,
in.group,
balance.group = NULL,
do.balance = F,
subsample.to = 500,
allow.min.cells = F,
min.cells = 25,
mc.cores=10,
use.hvg = T,
frac.min=.25){
lvls <- unique(colData(sce)[,in.group])
out <- lapply(lvls, function(i){
cat("Finding genes for ", i, "\n")
sce.gene.enrichment(sce = sce,
in.group = in.group,
label = i,
balance.group = balance.group,
do.balance = do.balance,
subsample.to = subsample.to,
allow.min.cells = allow.min.cells,
min.cells = min.cells,
mc.cores=mc.cores,
use.hvg = use.hvg,
frac.min=frac.min)
})
names(out) <- lvls
return(out)
}
sce.plot.along <- function(sce, val, along = "stage.ord", as.boxplot = F, as.violin = F,
pointsize = .1, color=NULL){
val.orig <- val
has.rowdata <- ifelse(!is.null(ncol(rowData(sce))),
ifelse("gene_name" %in% colnames(rowData(sce)),
ifelse(val %in% rowData(sce)[,"gene_name"],
T,
F),
F),
F)
if(length(val) == ncol(sce)){
value = val
}else if(val %in% rownames(sce)){
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
}else if( has.rowdata ){
val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
print(val)
value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
}else{
stop(paste0("Could not find \"", val, "\" in the object!"))
}
if(!is.null(color)){
df <- tibble(
cell_id = colnames(sce),
exprs = value
) %>%
left_join(
as.data.frame(colData(sce)) %>%
rownames_to_column("cell_id")
) %>%
select(exprs, along, color) %>%
dplyr::rename(x = along,
group = color)
}else{
df <- tibble(
cell_id = colnames(sce),
exprs = value
) %>%
left_join(
as.data.frame(colData(sce)) %>%
rownames_to_column("cell_id")
) %>%
select(exprs, along, color) %>%
dplyr::rename(x = along)
}
plt <- df %>%
ggplot(aes(x=x, y=exprs)) +
xlab(along)
if(as.boxplot){
plt <- plt + geom_boxplot()
}else if(as.violin){
plt <- plt + geom_violin()
}else if(!is.null(color)){
plt <- plt + geom_point(aes(color=group),
size=pointsize)
}else{
plt <- plt + geom_point(size=pointsize)
}
if(!is.numeric(colData(sce)[,along])){
plt <- plt+
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5))
}
return(plt)
}
sce.get.gene.exprs <- function(sce, gene, umi=F){
out <- NULL
gene.orig <- gene
if(!is.null(ncol(rowData(sce)))){
if("gene_name" %in% colnames(rowData(sce))){
if(gene %in% rowData(sce)[,"gene_name"]){
gene <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)]
}
}
}
if(gene %in% rownames(sce)){
if(!umi){
out <- log1p(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi")))
}else{
out <- assay(sce, "umi")[gene,]
}
}
return(out)
}
sce.dotplot <- function(sce, genes, group.by = "cell_type"){
tmp <- lapply(genes, function(g){
exprs <- sce.get.gene.exprs(sce, g, umi = T)
if(is.null(exprs)){
return(NULL)
}else{
colData(sce) %>%
as_tibble() %>%
add_column(umi = exprs) %>%
mutate(exprs = umi / colSums(assay(sce, "umi"))) %>%
mutate(expresses = ifelse(umi > 0 , 1, 0)) %>%
dplyr::rename(gr = group.by) %>%
group_by(gr) %>%
summarise(exprs = mean(exprs),
expresses = sum(expresses) / n()) %>%
add_column(gene = g) %>%
mutate(exprs = ((exprs - min(exprs)) / max(exprs - min(exprs))))
}
}) %>% do.call(rbind, .)
if(is.null(tmp)) stop("No genes found!\n")
tmp %>%
ggplot(aes(x=gene, y=gr)) +
geom_point(aes(color=exprs, size=expresses)) +
theme(axis.text.x = element_text(angle=90, hjust=1,vjust=.5)) +
scale_color_continuous(name = "scaled mean expr.", low="gray", high="red") +
scale_size_continuous(name="fraction pos.") +
ylab(group.by)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment