klprint · March 6, 2020 13:20
diff --git a/sce_functions.R b/sce_functions.R
 library(hdf5r)
 library(Matrix)
 source("https://gist.githubusercontent.com/klprint/1ab4468eb3c54abcf0422dec6223b8fc/raw/b4cc33f5b4da25bcc2e678cf46b692fe67605460/single_cell_functions.R")
 library(SingleCellExperiment)
 library(tidyverse)

 sce.raw.pca <- function(sce, k = 100){
  umi <- assay(sce, "umi")
  
  cat("Getting informative genes\n")
  info.genes <- get.info.genes(umi)
  
  cat("\tSetting info genes\n")
  umi <- umi[info.genes, ]
  
  cat("Normalization\n")
  nrm <- t(umi) / colSums(umi)
  
  cat("Running PCA\n")
  pca <- irlba::prcomp_irlba(nrm, n = k)$x
  rownames(pca) <- colnames(umi)
  
  reducedDim(sce, "raw_pca") <- pca
  
  return(sce)
 }

 sce.split.by.col <- function(sco, column, min.cells = 1, n.cores = 10){
  lvls <- unique(colData(sco)[,column])

  sco <- sco[,!is.na(colData(sco)[,column])]
  
  out <- pbmcapply::pbmclapply(lvls, function(x){
    sel <- colData(sco)[,column] == x
    
    if(sum(sel, na.rm = T) <= min.cells) return(NA)
    
    sco[,sel]
  }, mc.cores = n.cores)
  
  names(out) <- lvls
  out <- out[!is.na(out)]
  return(out)
 }

 sce.run.liger <- function(sce.list, k = 25, assay.slot = "umi", plot.genes=F){
  require(liger)
  umi <- lapply(sce.list, function(x) assay(x, assay.slot))
  
  cat("Creating LIGER object\n")
  l = createLiger(umi)
  
  cat("Preprocessing LIGER\n")
  l = liger::normalize(l)
  l = liger::selectGenes(l, do.plot = plot.genes)
  l = liger::scaleNotCenter(l)
  
  
  cat("Running LIGER\n")
  l = optimizeALS(l, k = k)
  
  return(l)
 }

 run.umap <- function(X, n_components = 3, min_dist = .1, n_neighbors = 15){
  set.seed(1234)
  uwot::umap(
    X,
    metric = "cosine",
    min_dist = min_dist,
    n_neighbors = n_neighbors,
    verbose = T,
    n_components = n_components
  )
 }

 sce.run.umap <- function(sce, use.reduction = "liger", n_components = 3, min_dist = .1, n_neighbors = 15){
  out <- run.umap(reducedDim(sce, use.reduction), n_components = n_components, min_dist = min_dist, n_neighbors = n_neighbors)
  rownames(out) <- rownames(reducedDim(sce, use.reduction))
  
  reducedDim(sce, paste0("umap", n_components, "d")) <- out
  return(sce)
 }

 liger.to.sce <- function(liger.object, input.sce.list){
  out <- SingleCellExperiment(
    assays = list(umi = do.call(merge.sparse, [email protected])),
    colData = lapply(input.sce.list, colData) %>% do.call(rbind,.),
    reducedDims = list(
      liger = do.call(rbind, liger.object@H)
    )
  )
  
  return(out)
 }

 sce.pointcloud.cat <- function(sce, reduceDimName, categoryName){
  emb <- reducedDim(sce, reduceDimName)
  ctg <- colData(sce)[,categoryName]
  ctg[is.na(ctg)] <- "ND"
  
  emb <- emb[order(ctg), ]
  ctg <- sort(ctg)
  babyplots::pointCloud(emb, "categories", ctg)
 }

 sce.pointcloud.gene <- function(sce, reduceDimName, gene, assay="umi"){
  emb <- reducedDim(sce, reduceDimName)
  
  val <- log1p((assay(ct.comb["ENSG00000075891", ], assay) / colSums( assay(ct.comb, assay)))[1,])
  
  
  babyplots::pointCloud(emb, "values", val)
 }

 sce.pointcloud.val <- function(sce, reduceDimName, val){
  emb <- reducedDim(sce, reduceDimName)
  babyplots::pointCloud(emb, "values", val)
 }

 sce.pseudobulk <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10){
  splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells)
  #print(splitted)
  if(length(splitted) == 0){
      return(NA)
  }
  pb <- pbmcapply::pbmclapply(splitted, function(x) rowSums(assay(x, assay)), mc.cores = mc.cores) %>% 
    do.call(cbind, .)
  
  colnames(pb) <- names(splitted)
  return(pb)
 }

 sce.to.anndata <- function(sce, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
  cat("Creating AnnData Object\n")
  sc <- import("scanpy")
  
  ad <- sc$AnnData(X = t(assay(sce, assay)),
                   obs = as.data.frame(colData(sce)),
                   obsm = reducedDims(sce)@listData[!startsWith(reducedDimNames(sce), "X_")])
  
  print(ad)
  
  
  
  sc$pp$neighbors(ad, n_neighbors = as.integer(n_neighbors), use_rep = neighbors_from, n_pcs = as.integer(n_pcs))
  
  
  return(ad)
 }

 sce.diffmap <- function(sce, n_diff_comps = 5, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
  sc <- import("scanpy")
  ad <- sce.to.anndata(sce, 
                       n_neighbors = as.integer(n_neighbors), 
                       neighbors_from = neighbors_from,
                       n_pcs = as.integer(n_pcs),
                       assay = assay)
  
  
  
  cat("Running diffusion map\n")
  sc$tl$diffmap(ad, n_comps = as.integer(n_diff_comps))
  
  sce@metadata$ad <- ad$copy()
  
  reducedDim(sce, "X_diffmap") <- ad$obsm["X_diffmap"]
  return(sce)
 }

 sce.dpt <- function(sce, root_cell_index, n_dcs = 5){
  stopifnot("X_diffmap" %in% reducedDimNames(sce))
  
  sc <- import("scanpy")
  ad <- sce@metadata$ad
  
  ad$uns["iroot"] <- as.integer(root_cell_index-1)
  
  cat("Running DPT\n")
  sc$tl$dpt(ad, n_dcs = as.integer(n_dcs))
  
  sce$dpt <- ad$obs["dpt_pseudotime"][,1]
  sce$dpt_rank <- rank(sce$dpt, ties.method = "first")
  sce$dpt_rank <- sce$dpt_rank - min(sce$dpt_rank)
  sce$dpt_rank <- sce$dpt_rank / max(sce$dpt_rank)
  
  return(sce)
 }

 sce.plot.3d <- function(sce, val, redDim = "umap3d", width = NULL, height=NULL){
  coord <- reducedDim(sce, redDim)[,1:3]
  
  if(val %in% colnames(colData(sce))){
      value = colData(sce)[,val]
  }else if(val %in% rownames(sce)){
      value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else{
      stop(paste0("Could not find \"", val, "\" in the object!"))
  }
      
  val = value
  
  if(is.numeric(val)){
    val[is.na(val)] <- 0
    babyplots::pointCloud(
      coord,
      "values",
      val,
      width = width,
      height = height
    )
  }else if(is.factor(val) | is.character(val)){
    val[is.na(val)] <- "NA"
    coord <- coord[order(val), ]
    val <- sort(val)
    babyplots::pointCloud(
      coord,
      "categories",
      val,
      width = width,
      height = height
    )
  }else{
    stop("val is neither numeric, nor a character or factor!")
  }
 }

 sce.plot.2d <- function(sce, val, redDim = "umap2d", comps = 1:2, point_size=.1, with_repel = F, repel_size=3, do.sort=F, force.discrete = F){
  coord <- reducedDim(sce, redDim)[,comps]
  
  val.orig <- val
  
  has.rowdata <- ifelse(!is.null(ncol(rowData(sce))), 
                        ifelse("gene_name" %in% colnames(rowData(sce)), 
                               ifelse(val %in% rowData(sce)[,"gene_name"], 
                                      T, 
                                      F),
                               F), 
                        F)
  
  if(length(val) == ncol(sce)){
      value = val
  }else if(val %in% colnames(colData(sce))){
      value = colData(sce)[,val]
  }else if(val %in% rownames(sce)){
      value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else if( has.rowdata ){
      val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
      print(val)
      value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else{
      stop(paste0("Could not find \"", val, "\" in the object!"))
  }
  
  if(force.discrete) value <- factor(as.character(value), levels = as.character(sort(unique(value))))
  
  dat <-  tibble(
    cell_id = rownames(coord),
    x = coord[,1],
    y = coord[,2]
  ) %>% 
    left_join(
      tibble(
        cell_id = rownames(colData(sce)),
        val = value
      )
    ) 
    
  
  
  
  if(is.numeric(value)){
    if(do.sort){
      dat <- dat %>% arrange(val)
    }
    p <- dat %>% 
      ggplot(aes(x,y,color = val)) +
      geom_point(size=point_size)
    p <- p + 
      scale_color_continuous(name = val.orig, low="gray", high="red")
  }else{
    p <- dat %>% 
      ggplot(aes(x,y,color = val)) +
      geom_point(size=point_size)
    p <- p + 
      scale_color_discrete(name=val.orig) +
      guides(color = guide_legend(override.aes = aes(size = 2)))
  }
  
  if(with_repel){
    p <- p + ggrepel::geom_label_repel(
      data = function(x){
        x %>% group_by(val) %>% 
          summarise(x = median(x),
                    y = median(y))
      },
      mapping = aes(label = val), size = repel_size
    ) +
      theme(legend.position = "none")
  }
  
  return(p)
 }


 sce.run.liger.full = function(sce, min.cells = 50, k=25, split.by = "batch"){
  
    if(!is.na(ncol(rowData(sce)))) row.data = rowData(sce)
    
    cat("Splitting\n")
    data.list = sce.split.by.col(sce, split.by, min.cells = min.cells)
    cat("Running liger\n")
    data.liger = sce.run.liger(data.list,k=k)
    cat("Combining to SCE object\n")
    out = liger.to.sce(data.liger, data.list)
    
    
    tmp.rd = matrix(NA, ncol=k, nrow=nrow(out), dimnames = list(rownames(out), paste0("liger_", 1:k)))
    tmp.rd[colnames(data.liger@W), ] = t(data.liger@W)
    tmp.rd = as.data.frame(tmp.rd)
    
    if(!is.na(ncol(rowData(sce)))){
      rowData(out) <- tmp.rd %>% 
        rownames_to_column("gene_id") %>% 
        left_join(
          as.data.frame(row.data) %>% 
            rownames_to_column("gene_id")
        ) %>% 
        as.data.frame() %>% 
        column_to_rownames("gene_id")
    }else{
      rowData(out) = tmp.rd
    }
    
    
    cat("UMAP")
    out = sce.run.umap(out, n_components = 2)
    
    
    return(out)
 }
                              
 sce.liger.pheatmap = function(sce, order.col.by = "dpt", annot.cols = NULL){
    annot = NULL
    
    if(!is.null(annot.cols)){
        annot = as.data.frame(colData(sce)[,annot.cols])
        rownames(annot) <- colnames(sce)
        colnames(annot) <- annot.cols
    }
    print(head(annot))

    ph.data = t(reducedDim(sce, "liger")[order(colData(sce)[,order.col.by]), ])
    rownames(ph.data) = paste0("liger_", 1:nrow(ph.data))
    
    print(head(ph.data)[,1:5])

    pheatmap::pheatmap(
        ph.data,
        show_rownames=T,
        show_colnames=F,
        cluster_cols=F,
        clustering_distance_rows="correlation",
        color=colorRampPalette(c("white", "black"))(100),
        annotation_col = annot
    )
 }

 sce.subsample.per.group <- function(sce, group.by, n.cells=1000, min.cells = 25, allow.min = F,
                                    mc.ncores = 10){
  
  sce.list <- sce.split.by.col(sce, column = group.by, min.cells = min.cells, n.cores = mc.ncores)
  
  if(!allow.min) sce.list <- sce.list[lapply(sce.list, ncol) %>% do.call("c", .) >= 1000]
  
  sce.list <- lapply(sce.list, function(x){
    if(ncol(x) > n.cells){
      x <- x[,sample(colnames(x), n.cells, replace = F)]
    }
    
    return(x)
  })
  
  out <- do.call(cbind, sce.list)
  return(out)
 }

 sce.gene.enrichment <- function(sce, 
                                in.group, 
                                label, 
                                balance.group = NULL,
                                do.balance = F,
                                subsample.to = 500,
                                allow.min.cells = F,
                                min.cells = 25,
                                mc.cores=10, 
                                use.hvg = T, 
                                frac.min=.25){
  
  
  ## Subsampling, if wanted
  if(do.balance){
    sce <- sce.subsample.per.group(sce, 
                                   group.by = in.group,
                                   n.cells = subsample.to,
                                   min.cells = min.cells,
                                   allow.min = allow.min.cells,
                                   mc.ncores = mc.cores)
  }
  ##
  
  umi <- assay(sce, "umi")
  sf <- colSums(umi)
  if(use.hvg) umi <- umi[get.info.genes(umi), ]
  
  group.labels <- colData(sce)[,in.group]
  are.ingroup <- group.labels %in% label
  
  test.genes <- rowSums( umi[,are.ingroup] > 0 ) > floor(frac.min * sum(are.ingroup))
  
  umi <- umi[test.genes, ]
  
  out <- pbmcapply::pbmclapply(rownames(umi), function(x){
    tibble(
      gene_id = x,
      log2fc = log2(mean( log1p(umi[x,are.ingroup] / sf[are.ingroup])) / mean(log1p(umi[x,!are.ingroup] / sf[!are.ingroup]))),
      p.value = wilcox.test(x = log1p(umi[x,are.ingroup] / sf[are.ingroup]), y =  log1p(umi[x,!are.ingroup] / sf[!are.ingroup]), alternative = "greater" )$p.value
    )
  }, mc.cores = mc.cores) %>% 
    do.call(rbind, .)
  
  out$p.adjust = p.adjust(out$p.value, method = "BH")
  
  if(!is.null(ncol(rowData(sce)))){
    if("gene_name" %in% colnames(rowData(sce))){
      out <- out %>% left_join(
        as.data.frame(rowData(sce)) %>% 
          rownames_to_column("gene_id") %>% 
          select("gene_id", "gene_name")
      )
    }
  }
  
  return(out %>% arrange(desc(log2fc)))
 }

 sce.all.markers <- function(sce, 
                            in.group, 
                            balance.group = NULL,
                            do.balance = F,
                            subsample.to = 500,
                            allow.min.cells = F,
                            min.cells = 25,
                            mc.cores=10, 
                            use.hvg = T, 
                            frac.min=.25){
  
  lvls <- unique(colData(sce)[,in.group])
  
  out <- lapply(lvls, function(i){
    cat("Finding genes for ", i, "\n")
    sce.gene.enrichment(sce = sce, 
                        in.group = in.group, 
                        label = i, 
                        balance.group = balance.group,
                        do.balance = do.balance,
                        subsample.to = subsample.to,
                        allow.min.cells = allow.min.cells,
                        min.cells = min.cells,
                        mc.cores=mc.cores, 
                        use.hvg = use.hvg, 
                        frac.min=frac.min)
  })
  names(out) <- lvls
  
  return(out)
  
 }

 sce.plot.along <- function(sce, val, along = "stage.ord", as.boxplot = F, as.violin = F,
                           pointsize = .1, color=NULL){
  val.orig <- val
  
  has.rowdata <- ifelse(!is.null(ncol(rowData(sce))), 
                        ifelse("gene_name" %in% colnames(rowData(sce)), 
                               ifelse(val %in% rowData(sce)[,"gene_name"], 
                                      T, 
                                      F),
                               F), 
                        F)
  
  if(length(val) == ncol(sce)){
    value = val
  }else if(val %in% rownames(sce)){
    value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else if( has.rowdata ){
    val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
    print(val)
    value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
  }else{
    stop(paste0("Could not find \"", val, "\" in the object!"))
  }
  
  if(!is.null(color)){
    df <- tibble(
      cell_id = colnames(sce),
      exprs = value
    ) %>% 
      left_join(
        as.data.frame(colData(sce)) %>% 
          rownames_to_column("cell_id")
      ) %>% 
      select(exprs, along, color) %>% 
      dplyr::rename(x = along,
                    group = color) 
  }else{
    df <- tibble(
      cell_id = colnames(sce),
      exprs = value
    ) %>% 
      left_join(
        as.data.frame(colData(sce)) %>% 
          rownames_to_column("cell_id")
      ) %>% 
      select(exprs, along, color) %>% 
      dplyr::rename(x = along) 
  }
  
  
  plt <- df %>% 
    ggplot(aes(x=x, y=exprs)) +
    xlab(along)
  
  if(as.boxplot){
    plt <- plt + geom_boxplot()
  }else if(as.violin){
    plt <- plt + geom_violin()
  }else if(!is.null(color)){
    plt <- plt + geom_point(aes(color=group),
                            size=pointsize)
  }else{
    plt <- plt + geom_point(size=pointsize)
  }
  
  if(!is.numeric(colData(sce)[,along])){
    plt <- plt+
      theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5))
  }
  
  return(plt)
  
 }

 sce.get.gene.exprs <- function(sce, gene, umi=F){
  out <- NULL
  gene.orig <- gene
  
  if(!is.null(ncol(rowData(sce)))){
    if("gene_name" %in% colnames(rowData(sce))){
      if(gene %in% rowData(sce)[,"gene_name"]){
        gene <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)]
      }
    }
  }
  
  if(gene %in% rownames(sce)){
    if(!umi){
      out <- log1p(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi")))
    }else{
      out <- assay(sce, "umi")[gene,]
    }
    
  }
  
  return(out)
 }

 sce.dotplot <- function(sce, genes, group.by = "cell_type"){
  tmp <- lapply(genes, function(g){
    
    exprs <- sce.get.gene.exprs(sce, g, umi = T)
    
    if(is.null(exprs)){
      return(NULL)
    }else{
      colData(sce) %>% 
        as_tibble() %>% 
        add_column(umi = exprs) %>% 
        mutate(exprs = umi / colSums(assay(sce, "umi"))) %>% 
        mutate(expresses = ifelse(umi > 0 , 1, 0)) %>% 
        dplyr::rename(gr = group.by) %>% 
        group_by(gr) %>% 
        summarise(exprs = mean(exprs),
                  expresses = sum(expresses) / n()) %>% 
        add_column(gene = g) %>% 
        mutate(exprs = ((exprs - min(exprs)) / max(exprs - min(exprs))))
    }
    
  }) %>% do.call(rbind, .)
  
  if(is.null(tmp)) stop("No genes found!\n")
  
  tmp %>% 
    ggplot(aes(x=gene, y=gr)) + 
    geom_point(aes(color=exprs, size=expresses)) +
    theme(axis.text.x = element_text(angle=90, hjust=1,vjust=.5)) +
    scale_color_continuous(name = "scaled mean expr.", low="gray", high="red") +
    scale_size_continuous(name="fraction pos.") +
    ylab(group.by)
 }
	library(hdf5r)
	library(Matrix)
	source("https://gist.githubusercontent.com/klprint/1ab4468eb3c54abcf0422dec6223b8fc/raw/b4cc33f5b4da25bcc2e678cf46b692fe67605460/single_cell_functions.R")
	library(SingleCellExperiment)
	library(tidyverse)

	sce.raw.pca <- function(sce, k = 100){
	umi <- assay(sce, "umi")

	cat("Getting informative genes\n")
	info.genes <- get.info.genes(umi)

	cat("\tSetting info genes\n")
	umi <- umi[info.genes, ]

	cat("Normalization\n")
	nrm <- t(umi) / colSums(umi)

	cat("Running PCA\n")
	pca <- irlba::prcomp_irlba(nrm, n = k)$x
	rownames(pca) <- colnames(umi)

	reducedDim(sce, "raw_pca") <- pca

	return(sce)
	}

	sce.split.by.col <- function(sco, column, min.cells = 1, n.cores = 10){
	lvls <- unique(colData(sco)[,column])

	sco <- sco[,!is.na(colData(sco)[,column])]

	out <- pbmcapply::pbmclapply(lvls, function(x){
	sel <- colData(sco)[,column] == x

	if(sum(sel, na.rm = T) <= min.cells) return(NA)

	sco[,sel]
	}, mc.cores = n.cores)

	names(out) <- lvls
	out <- out[!is.na(out)]
	return(out)
	}

	sce.run.liger <- function(sce.list, k = 25, assay.slot = "umi", plot.genes=F){
	require(liger)
	umi <- lapply(sce.list, function(x) assay(x, assay.slot))

	cat("Creating LIGER object\n")
	l = createLiger(umi)

	cat("Preprocessing LIGER\n")
	l = liger::normalize(l)
	l = liger::selectGenes(l, do.plot = plot.genes)
	l = liger::scaleNotCenter(l)


	cat("Running LIGER\n")
	l = optimizeALS(l, k = k)

	return(l)
	}

	run.umap <- function(X, n_components = 3, min_dist = .1, n_neighbors = 15){
	set.seed(1234)
	uwot::umap(
	X,
	metric = "cosine",
	min_dist = min_dist,
	n_neighbors = n_neighbors,
	verbose = T,
	n_components = n_components
	)
	}

	sce.run.umap <- function(sce, use.reduction = "liger", n_components = 3, min_dist = .1, n_neighbors = 15){
	out <- run.umap(reducedDim(sce, use.reduction), n_components = n_components, min_dist = min_dist, n_neighbors = n_neighbors)
	rownames(out) <- rownames(reducedDim(sce, use.reduction))

	reducedDim(sce, paste0("umap", n_components, "d")) <- out
	return(sce)
	}

	liger.to.sce <- function(liger.object, input.sce.list){
	out <- SingleCellExperiment(
	assays = list(umi = do.call(merge.sparse, [email protected])),
	colData = lapply(input.sce.list, colData) %>% do.call(rbind,.),
	reducedDims = list(
	liger = do.call(rbind, liger.object@H)
	)
	)

	return(out)
	}

	sce.pointcloud.cat <- function(sce, reduceDimName, categoryName){
	emb <- reducedDim(sce, reduceDimName)
	ctg <- colData(sce)[,categoryName]
	ctg[is.na(ctg)] <- "ND"

	emb <- emb[order(ctg), ]
	ctg <- sort(ctg)
	babyplots::pointCloud(emb, "categories", ctg)
	}

	sce.pointcloud.gene <- function(sce, reduceDimName, gene, assay="umi"){
	emb <- reducedDim(sce, reduceDimName)

	val <- log1p((assay(ct.comb["ENSG00000075891", ], assay) / colSums( assay(ct.comb, assay)))[1,])


	babyplots::pointCloud(emb, "values", val)
	}

	sce.pointcloud.val <- function(sce, reduceDimName, val){
	emb <- reducedDim(sce, reduceDimName)
	babyplots::pointCloud(emb, "values", val)
	}

	sce.pseudobulk <- function(sce, group.by, min_cells = 50, assay = "umi", mc.cores=10){
	splitted <- sce.split.by.col(sce, group.by, min.cells = min_cells)
	#print(splitted)
	if(length(splitted) == 0){
	return(NA)
	}
	pb <- pbmcapply::pbmclapply(splitted, function(x) rowSums(assay(x, assay)), mc.cores = mc.cores) %>%
	do.call(cbind, .)

	colnames(pb) <- names(splitted)
	return(pb)
	}

	sce.to.anndata <- function(sce, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
	cat("Creating AnnData Object\n")
	sc <- import("scanpy")

	ad <- sc$AnnData(X = t(assay(sce, assay)),
	obs = as.data.frame(colData(sce)),
	obsm = reducedDims(sce)@listData[!startsWith(reducedDimNames(sce), "X_")])

	print(ad)



	sc$pp$neighbors(ad, n_neighbors = as.integer(n_neighbors), use_rep = neighbors_from, n_pcs = as.integer(n_pcs))


	return(ad)
	}

	sce.diffmap <- function(sce, n_diff_comps = 5, neighbors_from = "liger", n_neighbors = 30, n_pcs = ncol(reducedDim(sce, neighbors_from)), assay = "umi"){
	sc <- import("scanpy")
	ad <- sce.to.anndata(sce,
	n_neighbors = as.integer(n_neighbors),
	neighbors_from = neighbors_from,
	n_pcs = as.integer(n_pcs),
	assay = assay)



	cat("Running diffusion map\n")
	sc$tl$diffmap(ad, n_comps = as.integer(n_diff_comps))

	sce@metadata$ad <- ad$copy()

	reducedDim(sce, "X_diffmap") <- ad$obsm["X_diffmap"]
	return(sce)
	}

	sce.dpt <- function(sce, root_cell_index, n_dcs = 5){
	stopifnot("X_diffmap" %in% reducedDimNames(sce))

	sc <- import("scanpy")
	ad <- sce@metadata$ad

	ad$uns["iroot"] <- as.integer(root_cell_index-1)

	cat("Running DPT\n")
	sc$tl$dpt(ad, n_dcs = as.integer(n_dcs))

	sce$dpt <- ad$obs["dpt_pseudotime"][,1]
	sce$dpt_rank <- rank(sce$dpt, ties.method = "first")
	sce$dpt_rank <- sce$dpt_rank - min(sce$dpt_rank)
	sce$dpt_rank <- sce$dpt_rank / max(sce$dpt_rank)

	return(sce)
	}

	sce.plot.3d <- function(sce, val, redDim = "umap3d", width = NULL, height=NULL){
	coord <- reducedDim(sce, redDim)[,1:3]

	if(val %in% colnames(colData(sce))){
	value = colData(sce)[,val]
	}else if(val %in% rownames(sce)){
	value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
	}else{
	stop(paste0("Could not find \"", val, "\" in the object!"))
	}

	val = value

	if(is.numeric(val)){
	val[is.na(val)] <- 0
	babyplots::pointCloud(
	coord,
	"values",
	val,
	width = width,
	height = height
	)
	}else if(is.factor(val) \| is.character(val)){
	val[is.na(val)] <- "NA"
	coord <- coord[order(val), ]
	val <- sort(val)
	babyplots::pointCloud(
	coord,
	"categories",
	val,
	width = width,
	height = height
	)
	}else{
	stop("val is neither numeric, nor a character or factor!")
	}
	}

	sce.plot.2d <- function(sce, val, redDim = "umap2d", comps = 1:2, point_size=.1, with_repel = F, repel_size=3, do.sort=F, force.discrete = F){
	coord <- reducedDim(sce, redDim)[,comps]

	val.orig <- val

	has.rowdata <- ifelse(!is.null(ncol(rowData(sce))),
	ifelse("gene_name" %in% colnames(rowData(sce)),
	ifelse(val %in% rowData(sce)[,"gene_name"],
	T,
	F),
	F),
	F)

	if(length(val) == ncol(sce)){
	value = val
	}else if(val %in% colnames(colData(sce))){
	value = colData(sce)[,val]
	}else if(val %in% rownames(sce)){
	value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
	}else if( has.rowdata ){
	val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
	print(val)
	value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
	}else{
	stop(paste0("Could not find \"", val, "\" in the object!"))
	}

	if(force.discrete) value <- factor(as.character(value), levels = as.character(sort(unique(value))))

	dat <- tibble(
	cell_id = rownames(coord),
	x = coord[,1],
	y = coord[,2]
	) %>%
	left_join(
	tibble(
	cell_id = rownames(colData(sce)),
	val = value
	)
	)




	if(is.numeric(value)){
	if(do.sort){
	dat <- dat %>% arrange(val)
	}
	p <- dat %>%
	ggplot(aes(x,y,color = val)) +
	geom_point(size=point_size)
	p <- p +
	scale_color_continuous(name = val.orig, low="gray", high="red")
	}else{
	p <- dat %>%
	ggplot(aes(x,y,color = val)) +
	geom_point(size=point_size)
	p <- p +
	scale_color_discrete(name=val.orig) +
	guides(color = guide_legend(override.aes = aes(size = 2)))
	}

	if(with_repel){
	p <- p + ggrepel::geom_label_repel(
	data = function(x){
	x %>% group_by(val) %>%
	summarise(x = median(x),
	y = median(y))
	},
	mapping = aes(label = val), size = repel_size
	) +
	theme(legend.position = "none")
	}

	return(p)
	}


	sce.run.liger.full = function(sce, min.cells = 50, k=25, split.by = "batch"){

	if(!is.na(ncol(rowData(sce)))) row.data = rowData(sce)

	cat("Splitting\n")
	data.list = sce.split.by.col(sce, split.by, min.cells = min.cells)
	cat("Running liger\n")
	data.liger = sce.run.liger(data.list,k=k)
	cat("Combining to SCE object\n")
	out = liger.to.sce(data.liger, data.list)


	tmp.rd = matrix(NA, ncol=k, nrow=nrow(out), dimnames = list(rownames(out), paste0("liger_", 1:k)))
	tmp.rd[colnames(data.liger@W), ] = t(data.liger@W)
	tmp.rd = as.data.frame(tmp.rd)

	if(!is.na(ncol(rowData(sce)))){
	rowData(out) <- tmp.rd %>%
	rownames_to_column("gene_id") %>%
	left_join(
	as.data.frame(row.data) %>%
	rownames_to_column("gene_id")
	) %>%
	as.data.frame() %>%
	column_to_rownames("gene_id")
	}else{
	rowData(out) = tmp.rd
	}


	cat("UMAP")
	out = sce.run.umap(out, n_components = 2)


	return(out)
	}

	sce.liger.pheatmap = function(sce, order.col.by = "dpt", annot.cols = NULL){
	annot = NULL

	if(!is.null(annot.cols)){
	annot = as.data.frame(colData(sce)[,annot.cols])
	rownames(annot) <- colnames(sce)
	colnames(annot) <- annot.cols
	}
	print(head(annot))

	ph.data = t(reducedDim(sce, "liger")[order(colData(sce)[,order.col.by]), ])
	rownames(ph.data) = paste0("liger_", 1:nrow(ph.data))

	print(head(ph.data)[,1:5])

	pheatmap::pheatmap(
	ph.data,
	show_rownames=T,
	show_colnames=F,
	cluster_cols=F,
	clustering_distance_rows="correlation",
	color=colorRampPalette(c("white", "black"))(100),
	annotation_col = annot
	)
	}

	sce.subsample.per.group <- function(sce, group.by, n.cells=1000, min.cells = 25, allow.min = F,
	mc.ncores = 10){

	sce.list <- sce.split.by.col(sce, column = group.by, min.cells = min.cells, n.cores = mc.ncores)

	if(!allow.min) sce.list <- sce.list[lapply(sce.list, ncol) %>% do.call("c", .) >= 1000]

	sce.list <- lapply(sce.list, function(x){
	if(ncol(x) > n.cells){
	x <- x[,sample(colnames(x), n.cells, replace = F)]
	}

	return(x)
	})

	out <- do.call(cbind, sce.list)
	return(out)
	}

	sce.gene.enrichment <- function(sce,
	in.group,
	label,
	balance.group = NULL,
	do.balance = F,
	subsample.to = 500,
	allow.min.cells = F,
	min.cells = 25,
	mc.cores=10,
	use.hvg = T,
	frac.min=.25){


	## Subsampling, if wanted
	if(do.balance){
	sce <- sce.subsample.per.group(sce,
	group.by = in.group,
	n.cells = subsample.to,
	min.cells = min.cells,
	allow.min = allow.min.cells,
	mc.ncores = mc.cores)
	}
	##

	umi <- assay(sce, "umi")
	sf <- colSums(umi)
	if(use.hvg) umi <- umi[get.info.genes(umi), ]

	group.labels <- colData(sce)[,in.group]
	are.ingroup <- group.labels %in% label

	test.genes <- rowSums( umi[,are.ingroup] > 0 ) > floor(frac.min * sum(are.ingroup))

	umi <- umi[test.genes, ]

	out <- pbmcapply::pbmclapply(rownames(umi), function(x){
	tibble(
	gene_id = x,
	log2fc = log2(mean( log1p(umi[x,are.ingroup] / sf[are.ingroup])) / mean(log1p(umi[x,!are.ingroup] / sf[!are.ingroup]))),
	p.value = wilcox.test(x = log1p(umi[x,are.ingroup] / sf[are.ingroup]), y = log1p(umi[x,!are.ingroup] / sf[!are.ingroup]), alternative = "greater" )$p.value
	)
	}, mc.cores = mc.cores) %>%
	do.call(rbind, .)

	out$p.adjust = p.adjust(out$p.value, method = "BH")

	if(!is.null(ncol(rowData(sce)))){
	if("gene_name" %in% colnames(rowData(sce))){
	out <- out %>% left_join(
	as.data.frame(rowData(sce)) %>%
	rownames_to_column("gene_id") %>%
	select("gene_id", "gene_name")
	)
	}
	}

	return(out %>% arrange(desc(log2fc)))
	}

	sce.all.markers <- function(sce,
	in.group,
	balance.group = NULL,
	do.balance = F,
	subsample.to = 500,
	allow.min.cells = F,
	min.cells = 25,
	mc.cores=10,
	use.hvg = T,
	frac.min=.25){

	lvls <- unique(colData(sce)[,in.group])

	out <- lapply(lvls, function(i){
	cat("Finding genes for ", i, "\n")
	sce.gene.enrichment(sce = sce,
	in.group = in.group,
	label = i,
	balance.group = balance.group,
	do.balance = do.balance,
	subsample.to = subsample.to,
	allow.min.cells = allow.min.cells,
	min.cells = min.cells,
	mc.cores=mc.cores,
	use.hvg = use.hvg,
	frac.min=frac.min)
	})
	names(out) <- lvls

	return(out)

	}

	sce.plot.along <- function(sce, val, along = "stage.ord", as.boxplot = F, as.violin = F,
	pointsize = .1, color=NULL){
	val.orig <- val

	has.rowdata <- ifelse(!is.null(ncol(rowData(sce))),
	ifelse("gene_name" %in% colnames(rowData(sce)),
	ifelse(val %in% rowData(sce)[,"gene_name"],
	T,
	F),
	F),
	F)

	if(length(val) == ncol(sce)){
	value = val
	}else if(val %in% rownames(sce)){
	value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
	}else if( has.rowdata ){
	val <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == val)]
	print(val)
	value = log1p(assay(sce, "umi")[val, ] / colSums(assay(sce, "umi")))
	}else{
	stop(paste0("Could not find \"", val, "\" in the object!"))
	}

	if(!is.null(color)){
	df <- tibble(
	cell_id = colnames(sce),
	exprs = value
	) %>%
	left_join(
	as.data.frame(colData(sce)) %>%
	rownames_to_column("cell_id")
	) %>%
	select(exprs, along, color) %>%
	dplyr::rename(x = along,
	group = color)
	}else{
	df <- tibble(
	cell_id = colnames(sce),
	exprs = value
	) %>%
	left_join(
	as.data.frame(colData(sce)) %>%
	rownames_to_column("cell_id")
	) %>%
	select(exprs, along, color) %>%
	dplyr::rename(x = along)
	}


	plt <- df %>%
	ggplot(aes(x=x, y=exprs)) +
	xlab(along)

	if(as.boxplot){
	plt <- plt + geom_boxplot()
	}else if(as.violin){
	plt <- plt + geom_violin()
	}else if(!is.null(color)){
	plt <- plt + geom_point(aes(color=group),
	size=pointsize)
	}else{
	plt <- plt + geom_point(size=pointsize)
	}

	if(!is.numeric(colData(sce)[,along])){
	plt <- plt+
	theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust=.5))
	}

	return(plt)

	}

	sce.get.gene.exprs <- function(sce, gene, umi=F){
	out <- NULL
	gene.orig <- gene

	if(!is.null(ncol(rowData(sce)))){
	if("gene_name" %in% colnames(rowData(sce))){
	if(gene %in% rowData(sce)[,"gene_name"]){
	gene <- rownames(rowData(sce))[which(rowData(sce)[,"gene_name"] == gene)]
	}
	}
	}

	if(gene %in% rownames(sce)){
	if(!umi){
	out <- log1p(assay(sce, "umi")[gene,] / colSums(assay(sce, "umi")))
	}else{
	out <- assay(sce, "umi")[gene,]
	}

	}

	return(out)
	}

	sce.dotplot <- function(sce, genes, group.by = "cell_type"){
	tmp <- lapply(genes, function(g){

	exprs <- sce.get.gene.exprs(sce, g, umi = T)

	if(is.null(exprs)){
	return(NULL)
	}else{
	colData(sce) %>%
	as_tibble() %>%
	add_column(umi = exprs) %>%
	mutate(exprs = umi / colSums(assay(sce, "umi"))) %>%
	mutate(expresses = ifelse(umi > 0 , 1, 0)) %>%
	dplyr::rename(gr = group.by) %>%
	group_by(gr) %>%
	summarise(exprs = mean(exprs),
	expresses = sum(expresses) / n()) %>%
	add_column(gene = g) %>%
	mutate(exprs = ((exprs - min(exprs)) / max(exprs - min(exprs))))
	}

	}) %>% do.call(rbind, .)

	if(is.null(tmp)) stop("No genes found!\n")

	tmp %>%
	ggplot(aes(x=gene, y=gr)) +
	geom_point(aes(color=exprs, size=expresses)) +
	theme(axis.text.x = element_text(angle=90, hjust=1,vjust=.5)) +
	scale_color_continuous(name = "scaled mean expr.", low="gray", high="red") +
	scale_size_continuous(name="fraction pos.") +
	ylab(group.by)
	}