Last active
January 30, 2020 12:45
-
-
Save klprint/1ab4468eb3c54abcf0422dec6223b8fc to your computer and use it in GitHub Desktop.
Widely used single cell functions in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Taken from Simon | |
# compute variances across column or rows for column-sparse matrices | |
library(Matrix) | |
colVars_spm <- function( spm ) { | |
stopifnot( is( spm, "dgCMatrix" ) ) | |
ans <- sapply( seq.int(spm@Dim[2]), function(j) { | |
mean <- sum( spm@x[ (spm@p[j]+1):spm@p[j+1] ] ) / spm@Dim[1] | |
sum( ( spm@x[ (spm@p[j]+1):spm@p[j+1] ] - mean )^2 ) + | |
mean^2 * ( spm@Dim[1] - ( spm@p[j+1] - spm@p[j] ) ) } ) / ( spm@Dim[1] - 1 ) | |
names(ans) <- spm@Dimnames[[2]] | |
ans | |
} | |
rowVars_spm <- function( spm ) { | |
colVars_spm( t(spm) ) | |
} | |
pca_on_informative_genes <- function( counts, nDim=20 ) { | |
norm_counts <- t(t(counts) / colSums(counts)) | |
gene_means <- rowMeans( norm_counts ) | |
gene_vars <- rowVars_spm( norm_counts ) | |
poisson_vmr <- mean( 1 / colSums( counts ) ) | |
informative_genes <- names(which( | |
gene_vars / gene_means > 1.5 * poisson_vmr )) | |
return( | |
list( | |
pca = irlba::prcomp_irlba( t(sqrt(norm_counts[informative_genes,])), n = nDim)$x, | |
info.genes = informative_genes | |
) | |
) | |
} | |
get.info.genes <- function( counts ) { | |
library(Matrix) | |
stopifnot(!is.null(dim(counts))) | |
norm_counts <- t(t(counts) / Matrix::colSums(counts)) | |
gene_means <- rowMeans( norm_counts ) | |
gene_vars <- rowVars_spm( norm_counts ) | |
poisson_vmr <- mean( 1 / Matrix::colSums( counts ) ) | |
informative_genes <- names(which( | |
gene_vars / gene_means > 1.5 * poisson_vmr )) | |
return( | |
informative_genes | |
) | |
} | |
get.info.genes.batchwise <- function( all_counts, per.cell.batch ){ | |
tmp <- lapply(unique(per.cell.batch), function(pcb){ | |
counts <- all_counts[,per.cell.batch == pcb] | |
get.info.genes(counts) | |
}) | |
print(lapply(tmp, length)) | |
return( | |
unique( | |
do.call("c", tmp) | |
) | |
) | |
} | |
################### | |
do.smooth <- function(pca, raw, gene, alpha=.1){ | |
library(locfit) | |
if(ncol(pca) > 15){ | |
cat("Reducing the number of PCs to 15\nto prevent locfit bug.") | |
pca <- pca[,1:15] | |
} | |
predict( locfit.raw( | |
pca[,1:15], | |
raw[gene,], | |
base = log( colSums(raw) ), | |
alpha = alpha, deg = 1, family = "poisson", ev = dat() ) ) | |
} | |
get.10x.data <- function(path){ | |
out <- list() | |
out$raw <- Seurat::Read10X(path) | |
return(out) | |
} | |
get.h5.data <- function(path){ | |
hf <- hdf5r::H5File$new(path, mode="r") | |
cnts <- Matrix::Matrix(data = hf[["umi/ft/counts"]][,], | |
dimnames = list(hf[["umi/ft/genes"]][], | |
hf[["umi/ft/cells"]][])) | |
datalist <- list() | |
datalist$raw <- cnts | |
return(datalist) | |
} | |
do.preprocess <- function(datalist, nGenes = 500, nCells = 1, nDim = 20){ | |
cat("Subsetting the data for: ") | |
cat(nGenes, "genes per cell and", nCells, "cells per gene minimum\n") | |
datalist$raw <- datalist$raw[ , colSums( datalist$raw>0 ) >= nGenes ] | |
datalist$raw <- datalist$raw[ rowSums(datalist$raw) >= nCells, ] | |
cat("Normalizing\n") | |
datalist$nrm <- t(t(datalist$raw) / colSums(datalist$raw)) | |
cat("Running PCA\n") | |
tmp <- pca_on_informative_genes(datalist$raw, nDim) | |
datalist$pca <- tmp$pca | |
datalist$info.genes <- tmp$info.genes | |
return(datalist) | |
} | |
runUMAP <- function(datalist, nPC = 15){ | |
uwot::umap( datalist$pca[,1:nPC], | |
n_neighbors = 30, | |
min_dist = .3, | |
metric = "cosine" ) | |
} | |
run.seurat <- function(datalist, ident.var.genes = F){ | |
library(Seurat) | |
seu <- CreateSeuratObject( datalist$raw , min.cells = 0, min.features = 0) | |
if(ident.var.genes){ | |
seu <- FindVariableFeatures( seu, selection.method = "vst", nfeatures = length(datalist$info.genes)) | |
}else{ | |
seu[["RNA"]]@var.features <- datalist$info.genes | |
} | |
#seu <- NormalizeData(seu) | |
seu <- ScaleData( seu ) | |
seu <- RunPCA( seu, verbose=FALSE ) | |
seu <- FindNeighbors( seu ) | |
seu <- FindClusters( seu ) | |
return(seu) | |
} | |
rem.batch <- function(mtx, l){ | |
out <- lapply(unique(l), function(x){ | |
cat("Collecting batch from", x, "\n") | |
tmp.mtx <- as.matrix(mtx[,l == x]) | |
# rs <- rowSums(mtx) | |
# names(rs) <- rownames(mtx) | |
rs <- rowSums(tmp.mtx) | |
# tmp.mtx <- sqrt(t( | |
# t(tmp.mtx) / colSums(tmp.mtx) | |
# )) | |
rs <- sqrt(rs / sum(rs)) | |
#rs <- apply(as.matrix(rs), 2, function(x) (x - min(x)) / (max(x - min(x)))) | |
return(rs) | |
#lm(as.matrix(tmp.mtx)~rs)$residuals | |
}) | |
rs.mat <- do.call(cbind, out) | |
cat("Normalizing\n") | |
exprs <- sqrt(t(t(mtx) / colSums(mtx))) | |
#exprs <- apply(as.matrix(exprs), 2, function(x) (x - min(x)) / (max(x - min(x)))) | |
cat("Calculating dot product\n") | |
dp <- t(exprs) %*% rs.mat | |
print(head(dp)) | |
cat("Doing linear regression\n") | |
out <- lm(as.matrix(t(exprs)) ~ as.matrix(dp))$residuals | |
return( | |
out | |
) | |
} | |
do.process.batches <- function( counts, batch.vec ){ | |
out <- list() | |
out$raw <- counts | |
cat("-----------------\n") | |
cat("Getting interesting genes\n") | |
cat("-----------------\n") | |
out$info.genes <- get.info.genes.batchwise(out$raw, batch.vec) | |
cat("-----------------\n") | |
cat("\n") | |
cat("-----------------\n") | |
cat("Batch removal\n") | |
cat("-----------------\n") | |
out$brm <- rem.batch(out$raw, batch.vec) | |
out$nrm <- sqrt(t(t(out$raw) / colSums(out$raw))) | |
cat("-----------------\n") | |
cat("\n") | |
cat("-----------------\n") | |
cat("Running PCA on interesting genes\n") | |
cat("-----------------\n") | |
out$pca <- irlba::prcomp_irlba(out$brm[,out$info.genes], n=20, scale. = F, center = T)$x | |
cat("-----------------\n") | |
cat("\n") | |
cat("-----------------\n") | |
cat("Running UMAP\n") | |
cat("-----------------\n") | |
out$umap <- uwot::umap(out$pca, metric="cosine", min_dist=.3, n_neighbors=30) | |
cat("-----------------\n") | |
out$batch <- batch.vec | |
return(out) | |
} | |
gen.seurat <- function(cnts, brm.mtx, var.genes){ | |
cat("Generating Seurat Object\n") | |
seu <- CreateSeuratObject(cnts) | |
cat("Storing precomputed data\n") | |
seu[["RNA"]]@data <- Matrix(brm.mtx[var.genes,], sparse = T) | |
seu[["RNA"]]@scale.data <- brm.mtx[var.genes,] | |
seu[["RNA"]]@var.features <- var.genes | |
cat("Running PCA\n") | |
seu <- RunPCA(seu) | |
seu <- FindNeighbors(seu) | |
seu <- FindClusters(seu) | |
} | |
project.data <- function(a.brm, b.brm, nDim=30){ | |
cat("Running PCA\n") | |
pca.a <- irlba::prcomp_irlba(a.brm, n = nDim) | |
cat("Projecting data\n") | |
pca.b <- scale(b.brm, center = pca.a$center, scale = pca.a$scale) %*% pca.a$rotation | |
return(rbind(pca.a$x, pca.b)) | |
} | |
project.annotation <- function(source.mtx, querry.mtx, source.annotation, assign.prob = .5){ | |
shared.genes <- intersect(colnames(source.mtx), colnames(querry.mtx)) | |
cat("Number of shared genes:", length(shared.genes), "\n") | |
cat("Running projection PCA\n") | |
cmbd.pca <- project.data(source.mtx[,shared.genes], | |
querry.mtx[,shared.genes]) | |
cat("Finding kNN\n") | |
tst.knn <- FNN::get.knnx(cmbd.pca[1:nrow(source.mtx), ], | |
cmbd.pca[(nrow(source.mtx)+1):nrow(cmbd.pca), ]) | |
cat("Voting\n") | |
tst.cl.vote <- t(apply(tst.knn$nn.index, 1, function(rw) { | |
source.annotation[rw] | |
})) | |
po.cl <- matrix(NA, ncol=length(unique(source.annotation)), nrow = nrow(tst.cl.vote)) | |
colnames(po.cl) <- unique(source.annotation) | |
for(i in 1:nrow(tst.cl.vote)){ | |
tbl <- table(tst.cl.vote[i,]) | |
po.cl[i,names(tbl)] <- as.vector(tbl) | |
} | |
rel.po.cl <- t(apply(po.cl, 1, function(rw){ | |
rw / sum(rw, na.rm = T) | |
})) | |
voted <- apply(rel.po.cl, 1, function(rw){ | |
if(max(rw, na.rm = T) < assign.prob){ | |
return(NA) | |
}else{ | |
y <- colnames(rel.po.cl)[rw == max(rw, na.rm = T)] | |
y[!is.na(y)][1] | |
} | |
}) | |
return(list( | |
voting.mtx = rel.po.cl, | |
result = voted, | |
knn.out = tst.knn | |
)) | |
} | |
union.matrix <- function(m1, m2){ | |
library(Matrix) | |
u.rw <- union(rownames(m1), rownames(m2)) | |
u.cl <- c(colnames(m1), colnames(m2)) | |
cmat <- Matrix(0, ncol=length(u.cl), nrow=length(u.rw), dimnames = list(u.rw, u.cl),sparse=F) | |
cat("Generated matrix with", ncol(cmat), "columns and", nrow(cmat), "rows.\n") | |
cat("Filling in the values\n") | |
cmat[rownames(m1), colnames(m1)] <- m1 | |
cmat[rownames(m2), colnames(m2)] <- m2 | |
return(Matrix(cmat, sparse = T)) | |
} | |
union.matrix.list <- function(...){ | |
library(Matrix) | |
mlist = list(...) | |
out = union.matrix(mlist[[1]], mlist[[2]]) | |
if(length(mlist) > 2){ | |
for(i in 3:length(mlist)){ | |
out = union.matrix(out, mlist[[i]]) | |
} | |
} | |
return(out) | |
} | |
create.h5 <- function(m, | |
filepath, | |
batch = NULL, | |
cluster = NULL, | |
embedding = NULL, | |
indiv_embedding = NULL){ | |
library(hdf5r) | |
h5file <- H5File$new(filepath, mode = "w") | |
h5file[["X"]] <- m | |
h5file[["genes"]] <- rownames(m) | |
h5file[["cells"]] <- colnames(m) | |
if(!is.null(batch)){ | |
h5file[["batch"]] <- batch | |
} | |
if(!is.null(cluster)){ | |
h5file[["cluster"]] <- cluster | |
} | |
if(!is.null(embedding)){ | |
h5file[["embedding"]] <- embedding | |
} | |
if(!is.null(indiv_embedding)){ | |
h5file[["indiv_embedding"]] <- indiv_embedding | |
} | |
h5file$close_all() | |
} | |
merge.sparse <- function(...) { | |
cnnew <- character() | |
rnnew <- character() | |
x <- vector() | |
i <- numeric() | |
j <- numeric() | |
icount <- 1 | |
cat("Merged matrices: ") | |
for (M in list(...)) { | |
cat(icount, " ") | |
cnold <- colnames(M) | |
rnold <- rownames(M) | |
cnnew <- union(cnnew,cnold) | |
rnnew <- union(rnnew,rnold) | |
cindnew <- match(cnold,cnnew) | |
rindnew <- match(rnold,rnnew) | |
ind <- unname(Matrix::which(M != 0,arr.ind=T)) | |
i <- c(i,rindnew[ind[,1]]) | |
j <- c(j,cindnew[ind[,2]]) | |
x <- c(x,M@x) | |
icount <- icount +1 | |
} | |
cat("\n") | |
sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew)) | |
} | |
##### New Funcs ##### | |
read.h5 <- function(path, slot = "ft"){ | |
H5 <- H5File$new(path, mode = "r") | |
mtx <- Matrix( | |
H5[[paste0("umi/", slot, "/counts")]][,], | |
dimnames = list( | |
H5[[paste0("umi/", slot, "/genes")]][], | |
H5[[paste0("umi/", slot, "/cells")]][] | |
), | |
sparse = T | |
) | |
H5$close_all() | |
return(mtx) | |
} | |
generate.data.list <- function(...){ | |
dl <- list(...) | |
all.sample.names <- do.call("c", lapply(dl, function(x) unique(as.character(x$orig.ident)))) | |
print(all.sample.names) | |
all.samples <- do.call("c",lapply(dl, function(dat){ | |
lapply(unique(dat$orig.ident), function(x) dat@assays$RNA@counts[,dat$orig.ident == x]) | |
})) | |
names(all.samples) <- all.sample.names | |
return(all.samples) | |
} | |
get.ensembl.names <- function(ids, mart = "mmusculus_gene_ensembl"){ | |
library(biomaRt) | |
mart <- useMart("ensembl", mart) | |
getBM( | |
c("ensembl_gene_id", | |
"external_gene_name"), | |
"ensembl_gene_id", | |
values = ids, | |
mart=mart | |
) | |
} | |
get.ensembl.names.for.list <- function(data.list, mart = "mmusculus_gene_ensembl"){ | |
get.ensembl.names( | |
unique(do.call("c", lapply(data.list, rownames))), | |
mart | |
) | |
} | |
filter.one2one <- function(data.list, | |
genes.meta, | |
translate.to.homolog = F, | |
return.gene.symbol = F){ | |
keep.genes <- genes.meta[genes.meta[,5] == "ortholog_one2one", "Gene.stable.ID"] | |
for(n in names(data.list)){ | |
d <- data.list[[n]] | |
old.n <- rownames(d) | |
d <- d[rownames(d) %in% keep.genes,] | |
if(nrow(d) == 0){ | |
print(n) | |
print(head(old.n)) | |
print(head(keep.genes)) | |
stop("No genes kept!") | |
} | |
if(translate.to.homolog){ | |
new.names <- genes.meta[match(rownames(d), genes.meta[,1]),3] | |
rownames(d) <- new.names | |
} | |
data.list[[n]] <- d | |
} | |
return(data.list) | |
} | |
translate.geneid.to.name <- function(data.list, genes.meta){ | |
for(n in names(data.list)){ | |
d <- data.list[[n]] | |
old.n <- rownames(d) | |
new.n <- as.character(genes.meta$Gene.name[match(old.n, genes.meta$Gene.stable.ID)]) | |
new.n[is.na(new.n)] <- old.n[is.na(new.n)] | |
rownames(d) <- new.n | |
data.list[[n]] <- d | |
} | |
return(data.list) | |
} | |
get.exprs <- function(dl, gene){ | |
exprs <- list() | |
for(i in 1:length(dl)){ | |
nrm <- dl[[i]] | |
if(gene %in% rownames(nrm)){ | |
tmp <- nrm[gene, ] | |
names(tmp) <- colnames(nrm) | |
exprs[[i]] <- tmp | |
}else{ | |
tmp <- rep(0, ncol(nrm)) | |
names(tmp) <- colnames(nrm) | |
exprs[[i]] <- tmp | |
} | |
} | |
exprs <- do.call("c", exprs) | |
return(exprs) | |
} | |
plot.exprs.3d <- function(coord, exprs, size=.1){ | |
require(threejs) | |
require(RColorBrewer) | |
# tmp <- names(exprs) | |
exprs <- scale(exprs) | |
# names(exprs) <- tmp | |
exprs <- round(exprs, digits=2) | |
cols <- rev(colorRampPalette(brewer.pal(5, "Spectral"))(length(sort(unique(exprs))))) | |
coord <- coord[order(exprs), ] | |
exprs <- sort(exprs) | |
cols.vec <- cols[as.numeric(as.factor(exprs))] | |
# coord <- coord[names(exprs), ] | |
scatterplot3js( | |
coord[,1], | |
coord[,2], | |
coord[,3], | |
color = cols.vec, | |
size=size | |
) | |
} | |
plot.factor.3d <- function(coord, color, size=.1){ | |
color <- as.factor(color) | |
require(threejs) | |
require(RColorBrewer) | |
cols <- colorRampPalette(brewer.pal(8, "Set2"))(length(levels(color))) | |
cols.vec <- cols[as.numeric(color)] | |
scatterplot3js( | |
coord[,1], | |
coord[,2], | |
coord[,3], | |
color = cols.vec, | |
size=size, | |
labels = as.character(color) | |
) | |
} | |
scale.batchwise <- function(umi, hvg, n.cores = 10){ | |
cat("Preprocessing UMI matrices\n") | |
d.added <- pbmcapply::pbmclapply(umi, function(x){ | |
x <- x[rownames(x) %in% hvg, ] | |
# print(dim(x)) | |
add.genes <- hvg[!(hvg %in% rownames(x))] | |
# print(length(add.genes)) | |
mg <- Matrix(0, nrow = length(add.genes), ncol = ncol(x)) | |
rownames(mg) <- add.genes | |
colnames(mg) <- colnames(x) | |
# print(dim(mg)) | |
out <- rbind(x, mg) | |
out <- t(t(out) / colSums(out)) | |
return(out[hvg,]) | |
}, mc.cores = 10, ignore.interactive = T) | |
cat("Scaling\n") | |
avgs <- lapply(d.added, rowMeans) | |
ref <- do.call(pmin, avgs) | |
rescale <- pbmcapply::pbmclapply(seq_along(avgs), function(i){ | |
out <- ref/avgs[[i]] | |
out[!is.finite(out)] <- 0 | |
return(out) | |
}, mc.cores = n.cores, ignore.interactive = T) | |
fin <- pbmcapply::pbmclapply(seq_along(d.added), function(i){ | |
m <- d.added[[i]] | |
rs <- rescale[[i]] | |
Matrix(log1p(apply(m, 2, function(x) x * rs)), sparse = T) | |
}, mc.cores = n.cores, ignore.interactive = T) | |
cat("Correcting the matrix\n") | |
mtx <- t(do.call(cbind, fin)) | |
mtx <- mtx[,apply(mtx, 2, sd) != 0] | |
# cat("z-scoring\n") | |
# z.score <- function(x) (x - mean(x)) / sd(x) | |
# mtx.zscore <- apply(mtx, 2, z.score) | |
mtx.zscore <- mtx | |
cat("PCA\n") | |
set.seed(1234) | |
pca <- irlba::prcomp_irlba(mtx.zscore, n=75, verbose = T, scale.=T, center=T) | |
rownames(pca$x) <- rownames(mtx.zscore) | |
rownames(pca$rotation) <- colnames(mtx.zscore) | |
cat("UMAP\n") | |
set.seed(1234) | |
um <- uwot::umap(pca$x, metric="cosine", min_dist = .1, n_neighbors = 15, verbose = T, n_components = 3) | |
rownames(um) <- rownames(mtx.zscore) | |
return( | |
list( | |
pca = pca, | |
umap = um | |
) | |
) | |
} | |
read_merged_data <- function(data){ | |
library(hdf5r) | |
out <- list() | |
out$pca <- readRDS(file.path(data, "pca_model.rds")) | |
out$meta.data <- read_csv(file.path(data, "meta_data.csv")) | |
out$umap <- readRDS(file.path(data, "umap_model.rds")) | |
h5 <- H5File$new(file.path(data, "umi.hdf5"), mode="r") | |
cat("Reading UMI\n") | |
out$umi <- list() | |
for(n in names(h5)){ | |
print(n) | |
out$umi[[n]] <- Matrix( | |
h5[[n]][["counts"]][,], | |
dimnames = list( | |
h5[[n]][["genes"]][], | |
h5[[n]][["cells"]][] | |
), | |
sparse = T | |
) | |
} | |
h5$close_all() | |
out$stage <- do.call("c", lapply(out$umi, function(x) str_split_fixed(colnames(x)[1], "_",4)[,3])) | |
return(out) | |
} | |
make.edgelist <- function(nn){ | |
do.call(rbind, lapply(1:nrow(nn), function(i){ | |
x <- nn[i,] | |
xf <- x[1] | |
y <- x[-1] | |
from <- rep(xf, length(y)) | |
to <- y | |
cbind(from, to) | |
})) | |
} | |
get.clusters <- function(nn){ | |
cat("Generating graph\n") | |
el <- make.edgelist(nn) | |
g <- simplify(graph_from_edgelist(el, directed = F)) | |
cat("Cluster data\n") | |
cl <- cluster_louvain(g) | |
return( | |
list( | |
clusters = membership(cl), | |
graph = g | |
) | |
) | |
} | |
quick.split.cell.names <- function(cell.names, n=4){ | |
str_split_fixed(cell.names, "_", n) | |
} | |
make.pseudobulk <- function(m, group.by){ | |
group.by <- as.factor(group.by) | |
out <- tapply(colnames(m), group.by, function(i){ | |
if(length(i) > 1){ | |
Matrix::rowSums(m[,i]) | |
}else{ | |
m[,1] | |
} | |
}) | |
out <- do.call(cbind, out) | |
return(out) | |
} | |
qsplit <- function(x, n, sep="_"){ | |
str_split_fixed(x, sep, length(strsplit(x[1], sep)[[1]]))[,n] | |
} | |
scanpy.louvain <- function(umi.list, pca.coords, hvg = NULL, resolution = 1, n_pcs = 75L, n_neighbors = 30L){ | |
library(reticulate) | |
use_condaenv("r-reticulate") | |
py_config() | |
cat("Clustering using scanpy\n") | |
sc <- import("scanpy") | |
if(is.list(umi.list)){ | |
mat <- do.call(merge.sparse, umi.list) | |
}else{ | |
mat <- umi.list | |
} | |
mat <- mat[,rownames(pca.coords)] | |
if(!is.null(hvg)){ | |
mat <- mat[rownames(mat) %in% hvg, ] | |
} | |
adata <- sc$AnnData(X = t(mat), | |
obsm = list("PCA" = pca.coords)) | |
cat("\tFinding neighbors\n") | |
sc$pp$neighbors(adata, use_rep = "PCA", n_neighbors = n_neighbors, n_pcs = n_pcs) | |
cat("\tRunning Louvain algorithm\n") | |
sc$tl$louvain(adata, resolution = resolution) | |
cl.orig <- adata$obs[,1] | |
cl <- paste0("c", cl.orig) | |
names(cl.orig) <- colnames(mat) | |
names(cl) <- colnames(mat) | |
return(cl.orig) | |
} | |
split.umi <- function(umi, batch.vec, prune = T){ | |
umi.list <- tapply(colnames(umi), as.factor(batch.vec), function(i){ | |
if(length(i) > 2){ | |
out <- umi[,i] | |
if(prune){ | |
out <- out[rowSums(out) > 0, ] | |
} | |
}else{ | |
out <- NA | |
} | |
return(out) | |
}) | |
umi.list <- umi.list[!is.na(umi.list)] | |
return(umi.list) | |
} | |
do.softmerge <- function(umi.list, features, n_pcs = 75, n_umaps = 3, subsample_pca = NULL, n_cores = 10, batchwise.zscore = T){ | |
if(batchwise.zscore){ | |
scl <- F | |
cntr <- F | |
} | |
cat("Processing a list of matrices with length", length(umi.list), ": ") | |
t.out <- pbmcapply::pbmclapply( umi.list, function(x){ | |
sf <- colSums(x) | |
o <- Matrix(apply(t(t(x[rownames(x) %in% features, ]) / sf), 1, function(y){ | |
z <- sqrt(y / mean(y)) | |
z[is.na(z)] <- 0 | |
return(z) | |
}),sparse=T) | |
return(o) | |
}, mc.cores = n_cores, ignore.interactive = T) | |
cat("\nDone preprocessing \n") | |
cat("Merging matrices\n") | |
t.mat <- do.call(merge.sparse, t.out) | |
#t.mat[is.na(t.mat)] <- 0 | |
t.mat <- t.mat[,colSums(t.mat) > 0] | |
if(batchwise.zscore){ | |
t.mat <- do.call(rbind, pbmcapply::pbmclapply(umi.list, function(u){ | |
cells <- colnames(u) | |
tmp <- t.mat[cells, ] | |
tmp <- scale(tmp) | |
tmp[is.na(tmp)] <- 0 | |
return(tmp) | |
}, mc.cores = n_cores, ignore.interactive = T)) | |
} | |
print(dim(t.mat)) | |
if(is.null(subsample_pca)){ | |
cat("PCA\n") | |
set.seed(1234) | |
t.pca <- irlba::prcomp_irlba(t.mat, n=n_pcs, center = cntr, scale. = scl, verbose=T) | |
rownames(t.pca$x) <- rownames(t.mat) | |
rownames(t.pca$rotation) <- colnames(t.mat) | |
cat("UMAP\n") | |
set.seed(1234) | |
t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T) | |
rownames(t.umap) <- rownames(t.mat) | |
}else{ | |
sel.cells <- sample(rownames(t.mat), subsample_pca) | |
sub.mat <- t.mat[sel.cells, ] | |
sub.mat <- sub.mat[,colSums(sub.mat) > 0] | |
cat("Running PCA with a subsampled dataset of size: ", nrow(sub.mat), ncol(sub.mat), "\n") | |
set.seed(1234) | |
t.pca <- irlba::prcomp_irlba(sub.mat, n=n_pcs, center = cntr, scale. = scl, verbose = T) | |
rownames(t.pca$x) <- sel.cells | |
rownames(t.pca$rotation) <- colnames(sub.mat) | |
cat("UMAP\n") | |
set.seed(1234) | |
t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T) | |
rownames(t.umap) <- sel.cells | |
} | |
return(list( | |
umap = t.umap, | |
pca = t.pca, | |
feats = colnames(t.mat) | |
)) | |
} | |
sparse.columnNorm <- function (A) | |
{ | |
if (class(A)[1] == "dgTMatrix") { | |
temp = summary(A) | |
A = sparseMatrix(i = temp[, 1], j = temp[, 2], x = temp[, | |
3]) | |
} | |
A@x <- A@x/rep.int(Matrix::colSums(A), diff(A@p)) | |
return(A) | |
} | |
write_matrix_h5 <- function(mat, file){ | |
require(hdf5r) | |
H5 <- H5File$new(file, "w") | |
if(is(mat, "dgCMatrix")){ | |
H5[["counts"]] <- summary(mat) | |
}else{ | |
H5[["counts"]] <- mat | |
} | |
H5[["colnames"]] <- colnames(mat) | |
H5[["rownames"]] <- rownames(mat) | |
H5$close_all() | |
} | |
read_sparse_h5 <- function(file){ | |
require(hdf5r) | |
H5 <- H5File$new(file, "r") | |
cnts <- H5[["counts"]][] | |
mtx <- sparseMatrix(i = cnts$i, j = cnts$j, x = cnts$x) | |
colnames(mtx) <- H5[["colnames"]][] | |
rownames(mtx) <- H5[["rownames"]][] | |
H5$close_all() | |
return(mtx) | |
} | |
#' GO enrichment of selected genes | |
#' | |
#' @param interesting.genes Genes to test enrichment for | |
#' @param gene.universe All genes to test against | |
#' @param gene2go.list | |
run.topGO <- function(interesting.genes, gene.universe, gene2go.list, n.out = 25){ | |
require(topGO) | |
gene.list <- as.factor(as.numeric(gene.universe %in% interesting.genes)) | |
names(gene.list) <- gene.universe | |
sampleGO <- new("topGOdata", | |
ontology = "BP", | |
allGenes = gene.list, | |
# geneSel = names(treecut.pt)[treecut.pt == 2], | |
annot = annFUN.gene2GO, | |
gene2GO = gene2go.list) | |
resultFisher <- runTest(sampleGO, algorithm = "classic", statistic = "fisher") | |
return( | |
GenTable(sampleGO, | |
classicFisher = resultFisher, | |
orderBy = "classicFisher", ranksOf = "classicFisher", topNodes = n.out) | |
) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment