klprint · January 30, 2020 12:45
diff --git a/single_cell_functions.R b/single_cell_functions.R
 # Taken from Simon
 # compute variances across column or rows for column-sparse matrices
 library(Matrix)
 colVars_spm <- function( spm ) {
  stopifnot( is( spm, "dgCMatrix" ) )
  ans <- sapply( seq.int(spm@Dim[2]), function(j) {
    mean <- sum( spm@x[ (spm@p[j]+1):spm@p[j+1] ] ) / spm@Dim[1]
    sum( ( spm@x[ (spm@p[j]+1):spm@p[j+1] ] - mean )^2 ) +
      mean^2 * ( spm@Dim[1] - ( spm@p[j+1] - spm@p[j] ) ) } ) / ( spm@Dim[1] - 1 )
  names(ans) <- spm@Dimnames[[2]]
  ans
 }
 rowVars_spm <- function( spm ) {
  colVars_spm( t(spm) )
 }

 pca_on_informative_genes <- function( counts, nDim=20 ) {
  
  norm_counts <- t(t(counts) / colSums(counts))
  
  gene_means <- rowMeans( norm_counts )
  gene_vars <- rowVars_spm( norm_counts )
  poisson_vmr <- mean( 1 / colSums( counts ) )
  
  informative_genes <- names(which( 
    gene_vars / gene_means  >  1.5 * poisson_vmr ))
  
  return(
    list(
      pca = irlba::prcomp_irlba( t(sqrt(norm_counts[informative_genes,])), n = nDim)$x,
      info.genes = informative_genes
    )
  )
  
 }

 get.info.genes <- function( counts ) {
  library(Matrix)
  stopifnot(!is.null(dim(counts)))
  norm_counts <- t(t(counts) / Matrix::colSums(counts))
  
  gene_means <- rowMeans( norm_counts )
  gene_vars <- rowVars_spm( norm_counts )
  poisson_vmr <- mean( 1 / Matrix::colSums( counts ) )
  
  informative_genes <- names(which( 
    gene_vars / gene_means  >  1.5 * poisson_vmr ))
  
  return(
    informative_genes
  )
  
 }

 get.info.genes.batchwise <- function( all_counts, per.cell.batch ){
  tmp <- lapply(unique(per.cell.batch), function(pcb){
    counts <- all_counts[,per.cell.batch == pcb]
    get.info.genes(counts)
  })
  
  print(lapply(tmp, length))
  return(
    unique(
      do.call("c", tmp)
    )
  )
 }

 ###################


 do.smooth <- function(pca, raw, gene, alpha=.1){
  library(locfit)
  if(ncol(pca) > 15){
    cat("Reducing the number of PCs to 15\nto prevent locfit bug.")
    pca <- pca[,1:15]
  }
  
  predict( locfit.raw( 
    pca[,1:15], 
    raw[gene,], 
    base = log( colSums(raw) ),
    alpha = alpha, deg = 1, family = "poisson", ev = dat() ) )
 }


 get.10x.data <- function(path){
  out <- list()
  out$raw <- Seurat::Read10X(path)
  return(out)
 }

 get.h5.data <- function(path){
  hf <- hdf5r::H5File$new(path, mode="r")
  cnts <- Matrix::Matrix(data = hf[["umi/ft/counts"]][,],
                         dimnames = list(hf[["umi/ft/genes"]][],
                                         hf[["umi/ft/cells"]][]))
  
  datalist <- list()
  datalist$raw <- cnts
  return(datalist)
 }


 do.preprocess <- function(datalist, nGenes = 500, nCells = 1, nDim = 20){
  cat("Subsetting the data for: ")
  cat(nGenes, "genes per cell and", nCells, "cells per gene minimum\n")
  datalist$raw <- datalist$raw[ , colSums( datalist$raw>0 ) >= nGenes ]
  datalist$raw <- datalist$raw[ rowSums(datalist$raw) >= nCells,  ]
  
  cat("Normalizing\n")
  datalist$nrm <- t(t(datalist$raw) / colSums(datalist$raw))
  
  cat("Running PCA\n")
  tmp <- pca_on_informative_genes(datalist$raw, nDim)
  datalist$pca <- tmp$pca
  datalist$info.genes <- tmp$info.genes
  
  return(datalist)
 }


 runUMAP <- function(datalist, nPC = 15){
  uwot::umap( datalist$pca[,1:nPC], 
              n_neighbors = 30, 
              min_dist = .3, 
              metric = "cosine" )
 }


 run.seurat <- function(datalist, ident.var.genes = F){
  library(Seurat)
  seu <- CreateSeuratObject( datalist$raw , min.cells = 0, min.features = 0)
  if(ident.var.genes){
    seu <- FindVariableFeatures( seu, selection.method = "vst", nfeatures = length(datalist$info.genes))
  }else{
    seu[["RNA"]]@var.features <- datalist$info.genes
  }
  
  #seu <- NormalizeData(seu)
  seu <- ScaleData( seu )
  seu <- RunPCA( seu, verbose=FALSE )
  seu <- FindNeighbors( seu )
  seu <- FindClusters( seu )
  
  return(seu)
 }


 rem.batch <- function(mtx, l){
  out <- lapply(unique(l), function(x){
    cat("Collecting batch from", x, "\n")
    tmp.mtx <- as.matrix(mtx[,l == x])
    # rs <- rowSums(mtx)
    # names(rs) <- rownames(mtx)
    rs <- rowSums(tmp.mtx)
    
    # tmp.mtx <- sqrt(t(
    #   t(tmp.mtx) / colSums(tmp.mtx)
    # ))
    
    rs <- sqrt(rs / sum(rs))
    #rs <- apply(as.matrix(rs), 2, function(x) (x - min(x)) / (max(x - min(x))))
    
    return(rs)
    
    #lm(as.matrix(tmp.mtx)~rs)$residuals
  })
  
  rs.mat <- do.call(cbind, out)
  
  cat("Normalizing\n")
  exprs <- sqrt(t(t(mtx) / colSums(mtx)))
  #exprs <- apply(as.matrix(exprs), 2, function(x) (x - min(x)) / (max(x - min(x))))
  
  cat("Calculating dot product\n")
  dp <- t(exprs) %*% rs.mat
  
  print(head(dp))
  
  cat("Doing linear regression\n")
  out <- lm(as.matrix(t(exprs)) ~ as.matrix(dp))$residuals
  
  return(
    out
  )
 }


 do.process.batches <- function( counts, batch.vec ){
  out <- list()
  out$raw <- counts
  cat("-----------------\n")
  cat("Getting interesting genes\n")
  cat("-----------------\n")
  out$info.genes <- get.info.genes.batchwise(out$raw, batch.vec)
  cat("-----------------\n")
  cat("\n")
  cat("-----------------\n")
  cat("Batch removal\n")
  cat("-----------------\n")
  out$brm <- rem.batch(out$raw, batch.vec)
  out$nrm <- sqrt(t(t(out$raw) / colSums(out$raw)))
  cat("-----------------\n")
  cat("\n")
  cat("-----------------\n")
  cat("Running PCA on interesting genes\n")
  cat("-----------------\n")
  out$pca <- irlba::prcomp_irlba(out$brm[,out$info.genes], n=20, scale. = F, center = T)$x
  cat("-----------------\n")
  cat("\n")
  cat("-----------------\n")
  cat("Running UMAP\n")
  cat("-----------------\n")
  out$umap <- uwot::umap(out$pca, metric="cosine", min_dist=.3, n_neighbors=30)
  cat("-----------------\n")
  out$batch <- batch.vec
  return(out)
 }


 gen.seurat <- function(cnts, brm.mtx, var.genes){
  cat("Generating Seurat Object\n")
  seu <- CreateSeuratObject(cnts)
  cat("Storing precomputed data\n")
  seu[["RNA"]]@data <-  Matrix(brm.mtx[var.genes,], sparse = T)
  seu[["RNA"]]@scale.data <- brm.mtx[var.genes,]
  seu[["RNA"]]@var.features <- var.genes
  
  cat("Running PCA\n")
  seu <- RunPCA(seu)
  seu <- FindNeighbors(seu)
  seu <- FindClusters(seu)
 }


 project.data <- function(a.brm, b.brm, nDim=30){
  cat("Running PCA\n")
  pca.a <- irlba::prcomp_irlba(a.brm, n = nDim)
  cat("Projecting data\n")
  pca.b <- scale(b.brm, center = pca.a$center, scale = pca.a$scale) %*% pca.a$rotation
  
  return(rbind(pca.a$x, pca.b))
 }

 project.annotation <- function(source.mtx, querry.mtx, source.annotation, assign.prob = .5){
  shared.genes <- intersect(colnames(source.mtx), colnames(querry.mtx))
  cat("Number of shared genes:", length(shared.genes), "\n")
  
  cat("Running projection PCA\n")
  cmbd.pca <- project.data(source.mtx[,shared.genes],
                           querry.mtx[,shared.genes])
  
  cat("Finding kNN\n")
  tst.knn <- FNN::get.knnx(cmbd.pca[1:nrow(source.mtx), ],
                           cmbd.pca[(nrow(source.mtx)+1):nrow(cmbd.pca), ])
  
  cat("Voting\n")
  tst.cl.vote <- t(apply(tst.knn$nn.index, 1, function(rw) {
    source.annotation[rw]
  }))
  
  po.cl <- matrix(NA, ncol=length(unique(source.annotation)), nrow = nrow(tst.cl.vote))
  colnames(po.cl) <- unique(source.annotation)
  for(i in 1:nrow(tst.cl.vote)){
    tbl <- table(tst.cl.vote[i,])
    po.cl[i,names(tbl)] <- as.vector(tbl)
  }
  
  rel.po.cl <- t(apply(po.cl, 1, function(rw){
    rw / sum(rw, na.rm = T)
  }))
  
  voted <- apply(rel.po.cl, 1, function(rw){
    if(max(rw, na.rm = T) < assign.prob){
      return(NA)
    }else{
      y <- colnames(rel.po.cl)[rw == max(rw, na.rm = T)]
      y[!is.na(y)][1]
    }
  })
  
  return(list(
    voting.mtx = rel.po.cl,
    result = voted,
    knn.out = tst.knn
  ))
 }

 union.matrix <- function(m1, m2){
  library(Matrix)
  u.rw <- union(rownames(m1), rownames(m2))
  u.cl <- c(colnames(m1), colnames(m2))
  
  cmat <- Matrix(0, ncol=length(u.cl), nrow=length(u.rw), dimnames = list(u.rw, u.cl),sparse=F)
  
  cat("Generated matrix with", ncol(cmat), "columns and", nrow(cmat), "rows.\n")
  
  cat("Filling in the values\n")
  cmat[rownames(m1), colnames(m1)] <- m1
  cmat[rownames(m2), colnames(m2)] <- m2
  
  return(Matrix(cmat, sparse = T))
 }
                 

 union.matrix.list <- function(...){
    library(Matrix)
    mlist = list(...)
    
    out = union.matrix(mlist[[1]], mlist[[2]])
    
    if(length(mlist) > 2){
       for(i in 3:length(mlist)){
        out = union.matrix(out, mlist[[i]])
       } 
    }
    
    
    return(out)
 }
                 
                 
 create.h5 <- function(m, 
                      filepath, 
                      batch = NULL, 
                      cluster = NULL, 
                      embedding = NULL,
                      indiv_embedding = NULL){
  library(hdf5r)
  h5file <- H5File$new(filepath, mode = "w")
  h5file[["X"]] <- m
  h5file[["genes"]] <- rownames(m)
  h5file[["cells"]] <- colnames(m)
  
  if(!is.null(batch)){
    h5file[["batch"]] <- batch
  }
  
  if(!is.null(cluster)){
    h5file[["cluster"]] <- cluster
  }
  
  if(!is.null(embedding)){
    h5file[["embedding"]] <- embedding
  }
    
  if(!is.null(indiv_embedding)){
      h5file[["indiv_embedding"]] <- indiv_embedding
  }
  
  h5file$close_all()
 }


 merge.sparse <- function(...) {
  
  cnnew <- character()
  rnnew <- character()
  x <- vector()
  i <- numeric()
  j <- numeric()
  
  icount <- 1
  cat("Merged matrices: ")
  for (M in list(...)) {
    cat(icount, " ")
    cnold <- colnames(M)
    rnold <- rownames(M)
    
    cnnew <- union(cnnew,cnold)
    rnnew <- union(rnnew,rnold)
    
    cindnew <- match(cnold,cnnew)
    rindnew <- match(rnold,rnnew)
    ind <- unname(Matrix::which(M != 0,arr.ind=T))
    i <- c(i,rindnew[ind[,1]])
    j <- c(j,cindnew[ind[,2]])
    x <- c(x,M@x)
    
    icount <- icount +1
    
  }
  cat("\n")
  
  sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew))
 }

 ##### New Funcs #####
 read.h5 <- function(path, slot = "ft"){
  H5 <- H5File$new(path, mode = "r")
  
  mtx <- Matrix(
    H5[[paste0("umi/", slot, "/counts")]][,],
    dimnames = list(
      H5[[paste0("umi/", slot, "/genes")]][],
      H5[[paste0("umi/", slot, "/cells")]][]
    ),
    sparse = T
  )
  
  H5$close_all()
  
  return(mtx)
 }

 generate.data.list <- function(...){
  dl <- list(...)
  
  all.sample.names <- do.call("c", lapply(dl, function(x) unique(as.character(x$orig.ident))))
  
  print(all.sample.names)
  
  all.samples <- do.call("c",lapply(dl, function(dat){
    lapply(unique(dat$orig.ident), function(x) dat@assays$RNA@counts[,dat$orig.ident == x])
  }))
  
  names(all.samples) <- all.sample.names
  
  return(all.samples)
 }

 get.ensembl.names <- function(ids, mart = "mmusculus_gene_ensembl"){
  library(biomaRt)
  mart <- useMart("ensembl", mart)
  
  getBM(
    c("ensembl_gene_id",
      "external_gene_name"),
    "ensembl_gene_id",
    values = ids,
    mart=mart
  )
 }

 get.ensembl.names.for.list <- function(data.list, mart = "mmusculus_gene_ensembl"){
  get.ensembl.names(
    unique(do.call("c", lapply(data.list, rownames))),
    mart
  )
 }


 filter.one2one <- function(data.list, 
                           genes.meta, 
                           translate.to.homolog = F,
                           return.gene.symbol = F){
  keep.genes <- genes.meta[genes.meta[,5] == "ortholog_one2one", "Gene.stable.ID"]
  
  for(n in names(data.list)){
    d <- data.list[[n]]
    old.n <- rownames(d)
    
    d <- d[rownames(d) %in% keep.genes,]
    
    if(nrow(d) == 0){
      print(n)
      print(head(old.n))
      print(head(keep.genes))
      stop("No genes kept!")
    }
    
    if(translate.to.homolog){
      new.names <- genes.meta[match(rownames(d), genes.meta[,1]),3]
      rownames(d) <- new.names
    }
    
    data.list[[n]] <- d
  }
  
  return(data.list)
 }

 translate.geneid.to.name <- function(data.list, genes.meta){
  for(n in names(data.list)){
    d <- data.list[[n]]
    old.n <- rownames(d)
    new.n <- as.character(genes.meta$Gene.name[match(old.n, genes.meta$Gene.stable.ID)])
    new.n[is.na(new.n)] <- old.n[is.na(new.n)]
    rownames(d) <- new.n
    
    
    data.list[[n]] <- d
  }
  
  return(data.list)
 }

 get.exprs <- function(dl, gene){
  exprs <- list()
  for(i in 1:length(dl)){
    nrm <- dl[[i]]
    if(gene %in% rownames(nrm)){
      tmp <-  nrm[gene, ]
      names(tmp) <- colnames(nrm)
      exprs[[i]] <- tmp
    }else{
      tmp <- rep(0, ncol(nrm))
      names(tmp) <- colnames(nrm)
      exprs[[i]] <- tmp
    }
  }
  exprs <- do.call("c", exprs)
  
  return(exprs)
 }


 plot.exprs.3d <- function(coord, exprs, size=.1){
  require(threejs)
  require(RColorBrewer)
  
  # tmp <- names(exprs)
  exprs <- scale(exprs)
  # names(exprs) <- tmp
  exprs <- round(exprs, digits=2)
  cols <- rev(colorRampPalette(brewer.pal(5, "Spectral"))(length(sort(unique(exprs)))))
  
  coord <- coord[order(exprs), ]
  exprs <- sort(exprs)
  cols.vec <- cols[as.numeric(as.factor(exprs))]
  # coord <- coord[names(exprs), ]
  
  scatterplot3js(
    coord[,1],
    coord[,2],
    coord[,3],
    color = cols.vec,
    size=size
  )
 }

 plot.factor.3d <- function(coord, color, size=.1){
  color <- as.factor(color)
  require(threejs)
  require(RColorBrewer)
  
  cols <- colorRampPalette(brewer.pal(8, "Set2"))(length(levels(color)))
  cols.vec <- cols[as.numeric(color)]
  
  scatterplot3js(
    coord[,1],
    coord[,2],
    coord[,3],
    color = cols.vec,
    size=size,
    labels = as.character(color)
  )
 }

 scale.batchwise <- function(umi, hvg, n.cores = 10){
  cat("Preprocessing UMI matrices\n")
  d.added <- pbmcapply::pbmclapply(umi, function(x){
    x <- x[rownames(x) %in% hvg, ]
    # print(dim(x))
    add.genes <- hvg[!(hvg %in% rownames(x))]
    
    # print(length(add.genes))
    
    
    mg <- Matrix(0, nrow = length(add.genes), ncol = ncol(x))
    rownames(mg) <- add.genes
    colnames(mg) <- colnames(x)
    # print(dim(mg))
    
    out <- rbind(x, mg)
    
    out <- t(t(out) / colSums(out))
    return(out[hvg,])
  }, mc.cores = 10, ignore.interactive = T)
  
  cat("Scaling\n")
  avgs <- lapply(d.added, rowMeans)
  ref <- do.call(pmin, avgs)
  
  rescale <- pbmcapply::pbmclapply(seq_along(avgs), function(i){
    out <- ref/avgs[[i]]
    out[!is.finite(out)] <- 0
    return(out)
  }, mc.cores = n.cores, ignore.interactive = T)
  
  
  fin <- pbmcapply::pbmclapply(seq_along(d.added), function(i){
    m <- d.added[[i]]
    rs <- rescale[[i]]
    
    Matrix(log1p(apply(m, 2, function(x) x * rs)), sparse = T)
  }, mc.cores = n.cores, ignore.interactive = T)
  
  cat("Correcting the matrix\n")
  mtx <- t(do.call(cbind, fin))
  mtx <- mtx[,apply(mtx, 2, sd) != 0]
  
  # cat("z-scoring\n")
  # z.score <- function(x) (x - mean(x)) / sd(x)
  # mtx.zscore <- apply(mtx, 2, z.score)
  mtx.zscore <- mtx
  
  cat("PCA\n")
  set.seed(1234)
  pca <- irlba::prcomp_irlba(mtx.zscore, n=75, verbose = T, scale.=T, center=T)
  rownames(pca$x) <- rownames(mtx.zscore)
  rownames(pca$rotation) <- colnames(mtx.zscore)
  
  cat("UMAP\n")
  set.seed(1234)
  um <- uwot::umap(pca$x, metric="cosine", min_dist = .1, n_neighbors = 15, verbose = T, n_components = 3)
  rownames(um) <- rownames(mtx.zscore)
  
  return(
    list(
      pca = pca,
      umap = um
    )
  )
 }


 read_merged_data <- function(data){
  library(hdf5r)
  out <- list()
  
  out$pca <- readRDS(file.path(data, "pca_model.rds"))
  out$meta.data <- read_csv(file.path(data, "meta_data.csv"))
  out$umap <- readRDS(file.path(data, "umap_model.rds"))
  
  h5 <- H5File$new(file.path(data, "umi.hdf5"), mode="r")
  
  cat("Reading UMI\n")
  out$umi <- list()
  for(n in names(h5)){
    print(n)
    out$umi[[n]] <- Matrix(
      h5[[n]][["counts"]][,],
      dimnames = list(
        h5[[n]][["genes"]][],
        h5[[n]][["cells"]][]
      ),
      sparse = T
    )
  }
  h5$close_all()
  
  out$stage <- do.call("c", lapply(out$umi, function(x) str_split_fixed(colnames(x)[1], "_",4)[,3]))
  return(out)
 }


 make.edgelist <- function(nn){
  do.call(rbind, lapply(1:nrow(nn), function(i){
    x <- nn[i,]
    xf <- x[1]
    y <- x[-1]
    
    from <- rep(xf, length(y))
    to <- y
    
    cbind(from, to)
  }))
 }

 get.clusters <- function(nn){
  cat("Generating graph\n")
  el <- make.edgelist(nn)
  g <- simplify(graph_from_edgelist(el, directed = F))
  
  cat("Cluster data\n")
  cl <- cluster_louvain(g)
  
  return(
    list(
      clusters = membership(cl),
      graph = g
    )
  )
 }

 quick.split.cell.names <- function(cell.names, n=4){
  str_split_fixed(cell.names, "_", n)
 }

 make.pseudobulk <- function(m, group.by){
  group.by <- as.factor(group.by)
  
  out <- tapply(colnames(m), group.by, function(i){
    if(length(i) > 1){
      Matrix::rowSums(m[,i])
    }else{
      m[,1]
    }
  })
  
  out <- do.call(cbind, out)
  
  return(out)
 }

 qsplit <- function(x, n, sep="_"){
  str_split_fixed(x, sep, length(strsplit(x[1], sep)[[1]]))[,n]
 }



 scanpy.louvain <- function(umi.list, pca.coords, hvg = NULL, resolution = 1, n_pcs = 75L, n_neighbors = 30L){
  library(reticulate)
  
  use_condaenv("r-reticulate")
  py_config()
  
  cat("Clustering using scanpy\n")
  sc <- import("scanpy")
  
  if(is.list(umi.list)){
    mat <- do.call(merge.sparse, umi.list)
  }else{
    mat <- umi.list
  }
  mat <- mat[,rownames(pca.coords)]
  
  if(!is.null(hvg)){
    mat <- mat[rownames(mat) %in% hvg, ]
  }
  
  adata <- sc$AnnData(X = t(mat), 
                      obsm = list("PCA" = pca.coords))
  
  cat("\tFinding neighbors\n")
  sc$pp$neighbors(adata, use_rep = "PCA", n_neighbors = n_neighbors, n_pcs = n_pcs)
  cat("\tRunning Louvain algorithm\n")
  sc$tl$louvain(adata, resolution = resolution)
  
  cl.orig <- adata$obs[,1]
  cl <- paste0("c", cl.orig)
  names(cl.orig) <- colnames(mat)
  names(cl) <- colnames(mat)
  
  return(cl.orig)
 }


 split.umi <- function(umi, batch.vec, prune = T){
  
  umi.list <- tapply(colnames(umi), as.factor(batch.vec), function(i){
    
    if(length(i) > 2){
      out <- umi[,i]
      if(prune){
        out <- out[rowSums(out) > 0, ]
      }
    }else{
      out <- NA
    }
    
    return(out)
    
  })
  
  umi.list <- umi.list[!is.na(umi.list)]
  
  return(umi.list)
 }

 do.softmerge <- function(umi.list, features,  n_pcs = 75, n_umaps = 3, subsample_pca = NULL, n_cores = 10, batchwise.zscore = T){
  
  if(batchwise.zscore){
    scl <- F
    cntr <- F
  }
  cat("Processing a list of matrices with length", length(umi.list), ": ")
  
  t.out <- pbmcapply::pbmclapply( umi.list, function(x){
    sf <- colSums(x)
    
    o <- Matrix(apply(t(t(x[rownames(x) %in% features, ]) / sf), 1, function(y){
      z <- sqrt(y / mean(y))
      z[is.na(z)] <- 0
      return(z)
    }),sparse=T)
    
    
    return(o)
    
  }, mc.cores = n_cores, ignore.interactive = T)
  
  cat("\nDone preprocessing \n")
  
  cat("Merging matrices\n")
  t.mat <- do.call(merge.sparse, t.out)
  #t.mat[is.na(t.mat)] <- 0
  t.mat <- t.mat[,colSums(t.mat) > 0]
  
  if(batchwise.zscore){
    
    t.mat <- do.call(rbind, pbmcapply::pbmclapply(umi.list, function(u){
      cells <- colnames(u)
      tmp <- t.mat[cells, ]
      tmp <- scale(tmp)
      tmp[is.na(tmp)] <- 0
      return(tmp)
    }, mc.cores = n_cores, ignore.interactive = T))
    
  }
  
  
  
  print(dim(t.mat))
  
  if(is.null(subsample_pca)){
    
    cat("PCA\n")
    set.seed(1234)
    t.pca <- irlba::prcomp_irlba(t.mat, n=n_pcs, center = cntr, scale. = scl, verbose=T)
    rownames(t.pca$x) <- rownames(t.mat)
    rownames(t.pca$rotation) <- colnames(t.mat)
    cat("UMAP\n")
    set.seed(1234)
    t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T)
    rownames(t.umap) <- rownames(t.mat)
    
  }else{
    
    sel.cells <- sample(rownames(t.mat), subsample_pca)
    
    
    sub.mat <- t.mat[sel.cells, ]
    sub.mat <- sub.mat[,colSums(sub.mat) > 0]
    
    cat("Running PCA with a subsampled dataset of size: ", nrow(sub.mat), ncol(sub.mat), "\n")
    set.seed(1234)
    t.pca <- irlba::prcomp_irlba(sub.mat, n=n_pcs, center = cntr, scale. = scl, verbose = T)
    rownames(t.pca$x) <- sel.cells
    rownames(t.pca$rotation) <- colnames(sub.mat)
    cat("UMAP\n")
    set.seed(1234)
    t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T)
    rownames(t.umap) <- sel.cells
    
  }
  
  
  
  return(list(
    umap = t.umap,
    pca = t.pca,
    feats = colnames(t.mat)
  ))
 }


 sparse.columnNorm <- function (A) 
 {
  if (class(A)[1] == "dgTMatrix") {
    temp = summary(A)
    A = sparseMatrix(i = temp[, 1], j = temp[, 2], x = temp[, 
                                                            3])
  }
  A@x <- A@x/rep.int(Matrix::colSums(A), diff(A@p))
  return(A)
 }
                                   
 write_matrix_h5 <- function(mat, file){
  require(hdf5r)
  H5 <- H5File$new(file, "w")
  if(is(mat, "dgCMatrix")){
    H5[["counts"]] <- summary(mat)
  }else{
    H5[["counts"]] <- mat
  }
  
  H5[["colnames"]] <- colnames(mat)
  H5[["rownames"]] <- rownames(mat)
  H5$close_all()
 }


 read_sparse_h5 <- function(file){
  require(hdf5r)
  H5 <- H5File$new(file, "r")
  cnts <- H5[["counts"]][]
  mtx <- sparseMatrix(i = cnts$i, j = cnts$j, x = cnts$x)
  colnames(mtx) <- H5[["colnames"]][]
  rownames(mtx) <- H5[["rownames"]][]
  H5$close_all()
  return(mtx)
 }
   
 #' GO enrichment of selected genes
 #'
 #' @param interesting.genes Genes to test enrichment for
 #' @param gene.universe All genes to test against
 #' @param gene2go.list 
 run.topGO <- function(interesting.genes, gene.universe, gene2go.list, n.out = 25){
  require(topGO)
  gene.list <- as.factor(as.numeric(gene.universe %in% interesting.genes))
  names(gene.list) <- gene.universe
  
  sampleGO <- new("topGOdata", 
                  ontology = "BP", 
                  allGenes = gene.list,
                  # geneSel = names(treecut.pt)[treecut.pt == 2],
                  annot = annFUN.gene2GO, 
                  gene2GO = gene2go.list)
  resultFisher <- runTest(sampleGO, algorithm = "classic", statistic = "fisher")
  
  return(
    GenTable(sampleGO, 
             classicFisher = resultFisher,
             orderBy = "classicFisher", ranksOf = "classicFisher", topNodes = n.out)
  )
  
 }
	# Taken from Simon
	# compute variances across column or rows for column-sparse matrices
	library(Matrix)
	colVars_spm <- function( spm ) {
	stopifnot( is( spm, "dgCMatrix" ) )
	ans <- sapply( seq.int(spm@Dim[2]), function(j) {
	mean <- sum( spm@x[ (spm@p[j]+1):spm@p[j+1] ] ) / spm@Dim[1]
	sum( ( spm@x[ (spm@p[j]+1):spm@p[j+1] ] - mean )^2 ) +
	mean^2 * ( spm@Dim[1] - ( spm@p[j+1] - spm@p[j] ) ) } ) / ( spm@Dim[1] - 1 )
	names(ans) <- spm@Dimnames[[2]]
	ans
	}
	rowVars_spm <- function( spm ) {
	colVars_spm( t(spm) )
	}

	pca_on_informative_genes <- function( counts, nDim=20 ) {

	norm_counts <- t(t(counts) / colSums(counts))

	gene_means <- rowMeans( norm_counts )
	gene_vars <- rowVars_spm( norm_counts )
	poisson_vmr <- mean( 1 / colSums( counts ) )

	informative_genes <- names(which(
	gene_vars / gene_means > 1.5 * poisson_vmr ))

	return(
	list(
	pca = irlba::prcomp_irlba( t(sqrt(norm_counts[informative_genes,])), n = nDim)$x,
	info.genes = informative_genes
	)
	)

	}

	get.info.genes <- function( counts ) {
	library(Matrix)
	stopifnot(!is.null(dim(counts)))
	norm_counts <- t(t(counts) / Matrix::colSums(counts))

	gene_means <- rowMeans( norm_counts )
	gene_vars <- rowVars_spm( norm_counts )
	poisson_vmr <- mean( 1 / Matrix::colSums( counts ) )

	informative_genes <- names(which(
	gene_vars / gene_means > 1.5 * poisson_vmr ))

	return(
	informative_genes
	)

	}

	get.info.genes.batchwise <- function( all_counts, per.cell.batch ){
	tmp <- lapply(unique(per.cell.batch), function(pcb){
	counts <- all_counts[,per.cell.batch == pcb]
	get.info.genes(counts)
	})

	print(lapply(tmp, length))
	return(
	unique(
	do.call("c", tmp)
	)
	)
	}

	###################


	do.smooth <- function(pca, raw, gene, alpha=.1){
	library(locfit)
	if(ncol(pca) > 15){
	cat("Reducing the number of PCs to 15\nto prevent locfit bug.")
	pca <- pca[,1:15]
	}

	predict( locfit.raw(
	pca[,1:15],
	raw[gene,],
	base = log( colSums(raw) ),
	alpha = alpha, deg = 1, family = "poisson", ev = dat() ) )
	}


	get.10x.data <- function(path){
	out <- list()
	out$raw <- Seurat::Read10X(path)
	return(out)
	}

	get.h5.data <- function(path){
	hf <- hdf5r::H5File$new(path, mode="r")
	cnts <- Matrix::Matrix(data = hf[["umi/ft/counts"]][,],
	dimnames = list(hf[["umi/ft/genes"]][],
	hf[["umi/ft/cells"]][]))

	datalist <- list()
	datalist$raw <- cnts
	return(datalist)
	}


	do.preprocess <- function(datalist, nGenes = 500, nCells = 1, nDim = 20){
	cat("Subsetting the data for: ")
	cat(nGenes, "genes per cell and", nCells, "cells per gene minimum\n")
	datalist$raw <- datalist$raw[ , colSums( datalist$raw>0 ) >= nGenes ]
	datalist$raw <- datalist$raw[ rowSums(datalist$raw) >= nCells, ]

	cat("Normalizing\n")
	datalist$nrm <- t(t(datalist$raw) / colSums(datalist$raw))

	cat("Running PCA\n")
	tmp <- pca_on_informative_genes(datalist$raw, nDim)
	datalist$pca <- tmp$pca
	datalist$info.genes <- tmp$info.genes

	return(datalist)
	}


	runUMAP <- function(datalist, nPC = 15){
	uwot::umap( datalist$pca[,1:nPC],
	n_neighbors = 30,
	min_dist = .3,
	metric = "cosine" )
	}


	run.seurat <- function(datalist, ident.var.genes = F){
	library(Seurat)
	seu <- CreateSeuratObject( datalist$raw , min.cells = 0, min.features = 0)
	if(ident.var.genes){
	seu <- FindVariableFeatures( seu, selection.method = "vst", nfeatures = length(datalist$info.genes))
	}else{
	seu[["RNA"]]@var.features <- datalist$info.genes
	}

	#seu <- NormalizeData(seu)
	seu <- ScaleData( seu )
	seu <- RunPCA( seu, verbose=FALSE )
	seu <- FindNeighbors( seu )
	seu <- FindClusters( seu )

	return(seu)
	}


	rem.batch <- function(mtx, l){
	out <- lapply(unique(l), function(x){
	cat("Collecting batch from", x, "\n")
	tmp.mtx <- as.matrix(mtx[,l == x])
	# rs <- rowSums(mtx)
	# names(rs) <- rownames(mtx)
	rs <- rowSums(tmp.mtx)

	# tmp.mtx <- sqrt(t(
	# t(tmp.mtx) / colSums(tmp.mtx)
	# ))

	rs <- sqrt(rs / sum(rs))
	#rs <- apply(as.matrix(rs), 2, function(x) (x - min(x)) / (max(x - min(x))))

	return(rs)

	#lm(as.matrix(tmp.mtx)~rs)$residuals
	})

	rs.mat <- do.call(cbind, out)

	cat("Normalizing\n")
	exprs <- sqrt(t(t(mtx) / colSums(mtx)))
	#exprs <- apply(as.matrix(exprs), 2, function(x) (x - min(x)) / (max(x - min(x))))

	cat("Calculating dot product\n")
	dp <- t(exprs) %*% rs.mat

	print(head(dp))

	cat("Doing linear regression\n")
	out <- lm(as.matrix(t(exprs)) ~ as.matrix(dp))$residuals

	return(
	out
	)
	}


	do.process.batches <- function( counts, batch.vec ){
	out <- list()
	out$raw <- counts
	cat("-----------------\n")
	cat("Getting interesting genes\n")
	cat("-----------------\n")
	out$info.genes <- get.info.genes.batchwise(out$raw, batch.vec)
	cat("-----------------\n")
	cat("\n")
	cat("-----------------\n")
	cat("Batch removal\n")
	cat("-----------------\n")
	out$brm <- rem.batch(out$raw, batch.vec)
	out$nrm <- sqrt(t(t(out$raw) / colSums(out$raw)))
	cat("-----------------\n")
	cat("\n")
	cat("-----------------\n")
	cat("Running PCA on interesting genes\n")
	cat("-----------------\n")
	out$pca <- irlba::prcomp_irlba(out$brm[,out$info.genes], n=20, scale. = F, center = T)$x
	cat("-----------------\n")
	cat("\n")
	cat("-----------------\n")
	cat("Running UMAP\n")
	cat("-----------------\n")
	out$umap <- uwot::umap(out$pca, metric="cosine", min_dist=.3, n_neighbors=30)
	cat("-----------------\n")
	out$batch <- batch.vec
	return(out)
	}


	gen.seurat <- function(cnts, brm.mtx, var.genes){
	cat("Generating Seurat Object\n")
	seu <- CreateSeuratObject(cnts)
	cat("Storing precomputed data\n")
	seu[["RNA"]]@data <- Matrix(brm.mtx[var.genes,], sparse = T)
	seu[["RNA"]]@scale.data <- brm.mtx[var.genes,]
	seu[["RNA"]]@var.features <- var.genes

	cat("Running PCA\n")
	seu <- RunPCA(seu)
	seu <- FindNeighbors(seu)
	seu <- FindClusters(seu)
	}


	project.data <- function(a.brm, b.brm, nDim=30){
	cat("Running PCA\n")
	pca.a <- irlba::prcomp_irlba(a.brm, n = nDim)
	cat("Projecting data\n")
	pca.b <- scale(b.brm, center = pca.a$center, scale = pca.a$scale) %*% pca.a$rotation

	return(rbind(pca.a$x, pca.b))
	}

	project.annotation <- function(source.mtx, querry.mtx, source.annotation, assign.prob = .5){
	shared.genes <- intersect(colnames(source.mtx), colnames(querry.mtx))
	cat("Number of shared genes:", length(shared.genes), "\n")

	cat("Running projection PCA\n")
	cmbd.pca <- project.data(source.mtx[,shared.genes],
	querry.mtx[,shared.genes])

	cat("Finding kNN\n")
	tst.knn <- FNN::get.knnx(cmbd.pca[1:nrow(source.mtx), ],
	cmbd.pca[(nrow(source.mtx)+1):nrow(cmbd.pca), ])

	cat("Voting\n")
	tst.cl.vote <- t(apply(tst.knn$nn.index, 1, function(rw) {
	source.annotation[rw]
	}))

	po.cl <- matrix(NA, ncol=length(unique(source.annotation)), nrow = nrow(tst.cl.vote))
	colnames(po.cl) <- unique(source.annotation)
	for(i in 1:nrow(tst.cl.vote)){
	tbl <- table(tst.cl.vote[i,])
	po.cl[i,names(tbl)] <- as.vector(tbl)
	}

	rel.po.cl <- t(apply(po.cl, 1, function(rw){
	rw / sum(rw, na.rm = T)
	}))

	voted <- apply(rel.po.cl, 1, function(rw){
	if(max(rw, na.rm = T) < assign.prob){
	return(NA)
	}else{
	y <- colnames(rel.po.cl)[rw == max(rw, na.rm = T)]
	y[!is.na(y)][1]
	}
	})

	return(list(
	voting.mtx = rel.po.cl,
	result = voted,
	knn.out = tst.knn
	))
	}

	union.matrix <- function(m1, m2){
	library(Matrix)
	u.rw <- union(rownames(m1), rownames(m2))
	u.cl <- c(colnames(m1), colnames(m2))

	cmat <- Matrix(0, ncol=length(u.cl), nrow=length(u.rw), dimnames = list(u.rw, u.cl),sparse=F)

	cat("Generated matrix with", ncol(cmat), "columns and", nrow(cmat), "rows.\n")

	cat("Filling in the values\n")
	cmat[rownames(m1), colnames(m1)] <- m1
	cmat[rownames(m2), colnames(m2)] <- m2

	return(Matrix(cmat, sparse = T))
	}


	union.matrix.list <- function(...){
	library(Matrix)
	mlist = list(...)

	out = union.matrix(mlist[[1]], mlist[[2]])

	if(length(mlist) > 2){
	for(i in 3:length(mlist)){
	out = union.matrix(out, mlist[[i]])
	}
	}


	return(out)
	}


	create.h5 <- function(m,
	filepath,
	batch = NULL,
	cluster = NULL,
	embedding = NULL,
	indiv_embedding = NULL){
	library(hdf5r)
	h5file <- H5File$new(filepath, mode = "w")
	h5file[["X"]] <- m
	h5file[["genes"]] <- rownames(m)
	h5file[["cells"]] <- colnames(m)

	if(!is.null(batch)){
	h5file[["batch"]] <- batch
	}

	if(!is.null(cluster)){
	h5file[["cluster"]] <- cluster
	}

	if(!is.null(embedding)){
	h5file[["embedding"]] <- embedding
	}

	if(!is.null(indiv_embedding)){
	h5file[["indiv_embedding"]] <- indiv_embedding
	}

	h5file$close_all()
	}


	merge.sparse <- function(...) {

	cnnew <- character()
	rnnew <- character()
	x <- vector()
	i <- numeric()
	j <- numeric()

	icount <- 1
	cat("Merged matrices: ")
	for (M in list(...)) {
	cat(icount, " ")
	cnold <- colnames(M)
	rnold <- rownames(M)

	cnnew <- union(cnnew,cnold)
	rnnew <- union(rnnew,rnold)

	cindnew <- match(cnold,cnnew)
	rindnew <- match(rnold,rnnew)
	ind <- unname(Matrix::which(M != 0,arr.ind=T))
	i <- c(i,rindnew[ind[,1]])
	j <- c(j,cindnew[ind[,2]])
	x <- c(x,M@x)

	icount <- icount +1

	}
	cat("\n")

	sparseMatrix(i=i,j=j,x=x,dims=c(length(rnnew),length(cnnew)),dimnames=list(rnnew,cnnew))
	}

	##### New Funcs #####
	read.h5 <- function(path, slot = "ft"){
	H5 <- H5File$new(path, mode = "r")

	mtx <- Matrix(
	H5[[paste0("umi/", slot, "/counts")]][,],
	dimnames = list(
	H5[[paste0("umi/", slot, "/genes")]][],
	H5[[paste0("umi/", slot, "/cells")]][]
	),
	sparse = T
	)

	H5$close_all()

	return(mtx)
	}

	generate.data.list <- function(...){
	dl <- list(...)

	all.sample.names <- do.call("c", lapply(dl, function(x) unique(as.character(x$orig.ident))))

	print(all.sample.names)

	all.samples <- do.call("c",lapply(dl, function(dat){
	lapply(unique(dat$orig.ident), function(x) dat@assays$RNA@counts[,dat$orig.ident == x])
	}))

	names(all.samples) <- all.sample.names

	return(all.samples)
	}

	get.ensembl.names <- function(ids, mart = "mmusculus_gene_ensembl"){
	library(biomaRt)
	mart <- useMart("ensembl", mart)

	getBM(
	c("ensembl_gene_id",
	"external_gene_name"),
	"ensembl_gene_id",
	values = ids,
	mart=mart
	)
	}

	get.ensembl.names.for.list <- function(data.list, mart = "mmusculus_gene_ensembl"){
	get.ensembl.names(
	unique(do.call("c", lapply(data.list, rownames))),
	mart
	)
	}


	filter.one2one <- function(data.list,
	genes.meta,
	translate.to.homolog = F,
	return.gene.symbol = F){
	keep.genes <- genes.meta[genes.meta[,5] == "ortholog_one2one", "Gene.stable.ID"]

	for(n in names(data.list)){
	d <- data.list[[n]]
	old.n <- rownames(d)

	d <- d[rownames(d) %in% keep.genes,]

	if(nrow(d) == 0){
	print(n)
	print(head(old.n))
	print(head(keep.genes))
	stop("No genes kept!")
	}

	if(translate.to.homolog){
	new.names <- genes.meta[match(rownames(d), genes.meta[,1]),3]
	rownames(d) <- new.names
	}

	data.list[[n]] <- d
	}

	return(data.list)
	}

	translate.geneid.to.name <- function(data.list, genes.meta){
	for(n in names(data.list)){
	d <- data.list[[n]]
	old.n <- rownames(d)
	new.n <- as.character(genes.meta$Gene.name[match(old.n, genes.meta$Gene.stable.ID)])
	new.n[is.na(new.n)] <- old.n[is.na(new.n)]
	rownames(d) <- new.n


	data.list[[n]] <- d
	}

	return(data.list)
	}

	get.exprs <- function(dl, gene){
	exprs <- list()
	for(i in 1:length(dl)){
	nrm <- dl[[i]]
	if(gene %in% rownames(nrm)){
	tmp <- nrm[gene, ]
	names(tmp) <- colnames(nrm)
	exprs[[i]] <- tmp
	}else{
	tmp <- rep(0, ncol(nrm))
	names(tmp) <- colnames(nrm)
	exprs[[i]] <- tmp
	}
	}
	exprs <- do.call("c", exprs)

	return(exprs)
	}


	plot.exprs.3d <- function(coord, exprs, size=.1){
	require(threejs)
	require(RColorBrewer)

	# tmp <- names(exprs)
	exprs <- scale(exprs)
	# names(exprs) <- tmp
	exprs <- round(exprs, digits=2)
	cols <- rev(colorRampPalette(brewer.pal(5, "Spectral"))(length(sort(unique(exprs)))))

	coord <- coord[order(exprs), ]
	exprs <- sort(exprs)
	cols.vec <- cols[as.numeric(as.factor(exprs))]
	# coord <- coord[names(exprs), ]

	scatterplot3js(
	coord[,1],
	coord[,2],
	coord[,3],
	color = cols.vec,
	size=size
	)
	}

	plot.factor.3d <- function(coord, color, size=.1){
	color <- as.factor(color)
	require(threejs)
	require(RColorBrewer)

	cols <- colorRampPalette(brewer.pal(8, "Set2"))(length(levels(color)))
	cols.vec <- cols[as.numeric(color)]

	scatterplot3js(
	coord[,1],
	coord[,2],
	coord[,3],
	color = cols.vec,
	size=size,
	labels = as.character(color)
	)
	}

	scale.batchwise <- function(umi, hvg, n.cores = 10){
	cat("Preprocessing UMI matrices\n")
	d.added <- pbmcapply::pbmclapply(umi, function(x){
	x <- x[rownames(x) %in% hvg, ]
	# print(dim(x))
	add.genes <- hvg[!(hvg %in% rownames(x))]

	# print(length(add.genes))


	mg <- Matrix(0, nrow = length(add.genes), ncol = ncol(x))
	rownames(mg) <- add.genes
	colnames(mg) <- colnames(x)
	# print(dim(mg))

	out <- rbind(x, mg)

	out <- t(t(out) / colSums(out))
	return(out[hvg,])
	}, mc.cores = 10, ignore.interactive = T)

	cat("Scaling\n")
	avgs <- lapply(d.added, rowMeans)
	ref <- do.call(pmin, avgs)

	rescale <- pbmcapply::pbmclapply(seq_along(avgs), function(i){
	out <- ref/avgs[[i]]
	out[!is.finite(out)] <- 0
	return(out)
	}, mc.cores = n.cores, ignore.interactive = T)


	fin <- pbmcapply::pbmclapply(seq_along(d.added), function(i){
	m <- d.added[[i]]
	rs <- rescale[[i]]

	Matrix(log1p(apply(m, 2, function(x) x * rs)), sparse = T)
	}, mc.cores = n.cores, ignore.interactive = T)

	cat("Correcting the matrix\n")
	mtx <- t(do.call(cbind, fin))
	mtx <- mtx[,apply(mtx, 2, sd) != 0]

	# cat("z-scoring\n")
	# z.score <- function(x) (x - mean(x)) / sd(x)
	# mtx.zscore <- apply(mtx, 2, z.score)
	mtx.zscore <- mtx

	cat("PCA\n")
	set.seed(1234)
	pca <- irlba::prcomp_irlba(mtx.zscore, n=75, verbose = T, scale.=T, center=T)
	rownames(pca$x) <- rownames(mtx.zscore)
	rownames(pca$rotation) <- colnames(mtx.zscore)

	cat("UMAP\n")
	set.seed(1234)
	um <- uwot::umap(pca$x, metric="cosine", min_dist = .1, n_neighbors = 15, verbose = T, n_components = 3)
	rownames(um) <- rownames(mtx.zscore)

	return(
	list(
	pca = pca,
	umap = um
	)
	)
	}


	read_merged_data <- function(data){
	library(hdf5r)
	out <- list()

	out$pca <- readRDS(file.path(data, "pca_model.rds"))
	out$meta.data <- read_csv(file.path(data, "meta_data.csv"))
	out$umap <- readRDS(file.path(data, "umap_model.rds"))

	h5 <- H5File$new(file.path(data, "umi.hdf5"), mode="r")

	cat("Reading UMI\n")
	out$umi <- list()
	for(n in names(h5)){
	print(n)
	out$umi[[n]] <- Matrix(
	h5[[n]][["counts"]][,],
	dimnames = list(
	h5[[n]][["genes"]][],
	h5[[n]][["cells"]][]
	),
	sparse = T
	)
	}
	h5$close_all()

	out$stage <- do.call("c", lapply(out$umi, function(x) str_split_fixed(colnames(x)[1], "_",4)[,3]))
	return(out)
	}


	make.edgelist <- function(nn){
	do.call(rbind, lapply(1:nrow(nn), function(i){
	x <- nn[i,]
	xf <- x[1]
	y <- x[-1]

	from <- rep(xf, length(y))
	to <- y

	cbind(from, to)
	}))
	}

	get.clusters <- function(nn){
	cat("Generating graph\n")
	el <- make.edgelist(nn)
	g <- simplify(graph_from_edgelist(el, directed = F))

	cat("Cluster data\n")
	cl <- cluster_louvain(g)

	return(
	list(
	clusters = membership(cl),
	graph = g
	)
	)
	}

	quick.split.cell.names <- function(cell.names, n=4){
	str_split_fixed(cell.names, "_", n)
	}

	make.pseudobulk <- function(m, group.by){
	group.by <- as.factor(group.by)

	out <- tapply(colnames(m), group.by, function(i){
	if(length(i) > 1){
	Matrix::rowSums(m[,i])
	}else{
	m[,1]
	}
	})

	out <- do.call(cbind, out)

	return(out)
	}

	qsplit <- function(x, n, sep="_"){
	str_split_fixed(x, sep, length(strsplit(x[1], sep)[[1]]))[,n]
	}



	scanpy.louvain <- function(umi.list, pca.coords, hvg = NULL, resolution = 1, n_pcs = 75L, n_neighbors = 30L){
	library(reticulate)

	use_condaenv("r-reticulate")
	py_config()

	cat("Clustering using scanpy\n")
	sc <- import("scanpy")

	if(is.list(umi.list)){
	mat <- do.call(merge.sparse, umi.list)
	}else{
	mat <- umi.list
	}
	mat <- mat[,rownames(pca.coords)]

	if(!is.null(hvg)){
	mat <- mat[rownames(mat) %in% hvg, ]
	}

	adata <- sc$AnnData(X = t(mat),
	obsm = list("PCA" = pca.coords))

	cat("\tFinding neighbors\n")
	sc$pp$neighbors(adata, use_rep = "PCA", n_neighbors = n_neighbors, n_pcs = n_pcs)
	cat("\tRunning Louvain algorithm\n")
	sc$tl$louvain(adata, resolution = resolution)

	cl.orig <- adata$obs[,1]
	cl <- paste0("c", cl.orig)
	names(cl.orig) <- colnames(mat)
	names(cl) <- colnames(mat)

	return(cl.orig)
	}


	split.umi <- function(umi, batch.vec, prune = T){

	umi.list <- tapply(colnames(umi), as.factor(batch.vec), function(i){

	if(length(i) > 2){
	out <- umi[,i]
	if(prune){
	out <- out[rowSums(out) > 0, ]
	}
	}else{
	out <- NA
	}

	return(out)

	})

	umi.list <- umi.list[!is.na(umi.list)]

	return(umi.list)
	}

	do.softmerge <- function(umi.list, features, n_pcs = 75, n_umaps = 3, subsample_pca = NULL, n_cores = 10, batchwise.zscore = T){

	if(batchwise.zscore){
	scl <- F
	cntr <- F
	}
	cat("Processing a list of matrices with length", length(umi.list), ": ")

	t.out <- pbmcapply::pbmclapply( umi.list, function(x){
	sf <- colSums(x)

	o <- Matrix(apply(t(t(x[rownames(x) %in% features, ]) / sf), 1, function(y){
	z <- sqrt(y / mean(y))
	z[is.na(z)] <- 0
	return(z)
	}),sparse=T)


	return(o)

	}, mc.cores = n_cores, ignore.interactive = T)

	cat("\nDone preprocessing \n")

	cat("Merging matrices\n")
	t.mat <- do.call(merge.sparse, t.out)
	#t.mat[is.na(t.mat)] <- 0
	t.mat <- t.mat[,colSums(t.mat) > 0]

	if(batchwise.zscore){

	t.mat <- do.call(rbind, pbmcapply::pbmclapply(umi.list, function(u){
	cells <- colnames(u)
	tmp <- t.mat[cells, ]
	tmp <- scale(tmp)
	tmp[is.na(tmp)] <- 0
	return(tmp)
	}, mc.cores = n_cores, ignore.interactive = T))

	}



	print(dim(t.mat))

	if(is.null(subsample_pca)){

	cat("PCA\n")
	set.seed(1234)
	t.pca <- irlba::prcomp_irlba(t.mat, n=n_pcs, center = cntr, scale. = scl, verbose=T)
	rownames(t.pca$x) <- rownames(t.mat)
	rownames(t.pca$rotation) <- colnames(t.mat)
	cat("UMAP\n")
	set.seed(1234)
	t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T)
	rownames(t.umap) <- rownames(t.mat)

	}else{

	sel.cells <- sample(rownames(t.mat), subsample_pca)


	sub.mat <- t.mat[sel.cells, ]
	sub.mat <- sub.mat[,colSums(sub.mat) > 0]

	cat("Running PCA with a subsampled dataset of size: ", nrow(sub.mat), ncol(sub.mat), "\n")
	set.seed(1234)
	t.pca <- irlba::prcomp_irlba(sub.mat, n=n_pcs, center = cntr, scale. = scl, verbose = T)
	rownames(t.pca$x) <- sel.cells
	rownames(t.pca$rotation) <- colnames(sub.mat)
	cat("UMAP\n")
	set.seed(1234)
	t.umap <- uwot::umap(t.pca$x, metric="cosine", n_neighbors = 30, min_dist=.1, n_components = n_umaps, verbose = T)
	rownames(t.umap) <- sel.cells

	}



	return(list(
	umap = t.umap,
	pca = t.pca,
	feats = colnames(t.mat)
	))
	}


	sparse.columnNorm <- function (A)
	{
	if (class(A)[1] == "dgTMatrix") {
	temp = summary(A)
	A = sparseMatrix(i = temp[, 1], j = temp[, 2], x = temp[,
	3])
	}
	A@x <- A@x/rep.int(Matrix::colSums(A), diff(A@p))
	return(A)
	}

	write_matrix_h5 <- function(mat, file){
	require(hdf5r)
	H5 <- H5File$new(file, "w")
	if(is(mat, "dgCMatrix")){
	H5[["counts"]] <- summary(mat)
	}else{
	H5[["counts"]] <- mat
	}

	H5[["colnames"]] <- colnames(mat)
	H5[["rownames"]] <- rownames(mat)
	H5$close_all()
	}


	read_sparse_h5 <- function(file){
	require(hdf5r)
	H5 <- H5File$new(file, "r")
	cnts <- H5[["counts"]][]
	mtx <- sparseMatrix(i = cnts$i, j = cnts$j, x = cnts$x)
	colnames(mtx) <- H5[["colnames"]][]
	rownames(mtx) <- H5[["rownames"]][]
	H5$close_all()
	return(mtx)
	}

	#' GO enrichment of selected genes
	#'
	#' @param interesting.genes Genes to test enrichment for
	#' @param gene.universe All genes to test against
	#' @param gene2go.list
	run.topGO <- function(interesting.genes, gene.universe, gene2go.list, n.out = 25){
	require(topGO)
	gene.list <- as.factor(as.numeric(gene.universe %in% interesting.genes))
	names(gene.list) <- gene.universe

	sampleGO <- new("topGOdata",
	ontology = "BP",
	allGenes = gene.list,
	# geneSel = names(treecut.pt)[treecut.pt == 2],
	annot = annFUN.gene2GO,
	gene2GO = gene2go.list)
	resultFisher <- runTest(sampleGO, algorithm = "classic", statistic = "fisher")

	return(
	GenTable(sampleGO,
	classicFisher = resultFisher,
	orderBy = "classicFisher", ranksOf = "classicFisher", topNodes = n.out)
	)

	}