IdoBar · August 15, 2025 12:48 · IdoBar · Jun 27, 2018 · IdoBar · Jun 4, 2019
diff --git a/util.R b/util.R
 # util.R:
 # Utilities to make R a happier place
 # Originally created by Brendan O'Connor, brenocon.com/code
 # Edited by Ido Bar.
 # Source the file directly from this gist using `devtools::source_gist("7f63547158ecdbacf31b54a58af0d1cc", filename = "util.R")`


 ########################################
 ## Put everything into an environment, to not pollute global namespace
 # Container environment for every helper defined in this file (attached at the end)
 util <- new.env()

 ########################################
 ## Better I/O routines
 # Convert the path separators in `x` to the convention of a target platform.
 #   x  : character vector of paths
 #   to : "windows" turns "/" into "\"; any other value turns "\" into "/".
 #        Defaults to the OS type R is running on.
 # Returns a character vector with one converted path per element of `x`.
 util$repath <- function(x, to=.Platform$OS.type) {
   if (length(to) == 1L && !is.na(to)) {
     # ifelse() shapes its result like the test, so with a scalar `to` it used to
     # return only the first path of a vector; branch explicitly instead.
     if (to == "windows") return(gsub('/', '\\\\', x))
     return(gsub('\\\\', '/', x))
   }
   # Non-scalar (or NA) `to`: keep the original element-wise behaviour
   ifelse(to=="windows", gsub('/', '\\\\', x), gsub('\\\\', '/', x))
 }
 # Search for a file in a list of directories and return the first existing match (NULL if none)
 # Look for `fn` inside each directory of `paths`, in order.
 #   fn    : file name (relative to each directory)
 #   paths : directories to probe; defaults to the entries of the RPATH environment
 #           variable, split on ";" (Windows) or ":" (elsewhere)
 # Returns the first existing path, or NULL when no directory contains the file.
 util$search.path <- function(fn,
     paths = strsplit(chartr("\\", "/", Sys.getenv("RPATH")), split =
                switch(.Platform$OS.type, windows = ";", ":"))[[1]]) {
  for (dir_path in paths) {
    candidate <- file.path(dir_path, fn)
    if (file.exists(candidate)) {
      return(candidate)
    }
  }
  NULL
 }
 # Find recent file in a folder (based on pattern to search)
 # Usage:  recent_file("./data/intermediate_files", glue::glue(".*{stacks_name}.+\\.clean.vcf"))
 # Return the most recent file in `filedir` whose name matches `pattern`.
 #   sort_by : only its first letter is used to pick the file.info() time column
 #             ("modified" -> mtime, "created" -> ctime, "accessed" -> atime)
 # Returns the full path of the newest match (NA when nothing matches).
 util$recent_file <- function(filedir, pattern, sort_by="modified"){
  candidates <- dir(filedir, pattern, full.names = TRUE)
  info <- file.info(candidates)
  time_column <- paste0(substr(sort_by,1,1), "time")
  newest_first <- order(info[[time_column]], decreasing = TRUE)
  # file.info() uses the file paths as row names
  rownames(info)[newest_first][1]
 }
  

 # deprecated in favour of readr::read_tsv() function from tidyverse
 # util$read.tsv <- function(..., header=F, sep='\t', quote='', comment='', na.strings='', stringsAsFactors=FALSE) {
  # read.table() wrapper with default settings for no-nonsense, pure TSV
  # Typical use case is output from another program.
  # (R's defaults are more geared for human-readable datafiles, which is less
  # feasible for large-scale data anyway.)
  # These options are substantially faster than read.table() defaults.
  #   (see e.g. LINK)
  # stringsAsFactors is the devil.

 #   args = list(...)
 # args$header = headp
 #  if (!is.null(args$col.names)) {
    # read.delim() is not smart about this.  Yikes.
 #    args$header = FALSE
 #  }
 #  args$sep = sep
 #  args$quote = quote
 #  args$comment = comment
 #  args$stringsAsFactors = stringsAsFactors
 #  args$na.strings = na.strings
 #  do.call(read.delim, args)
 # }
 # deprecated in favour of readr::write_tsv() function from tidyverse
 # util$write.tsv <- function(..., header=NA, col.names=F, row.names=F, sep='\t', na='', quote=F) {
  # 'header' to 'col.names' naming consistency with read.table()
 #  if (is.finite(header)) col.names = header
 #  write.table(..., col.names=col.names, row.names=row.names, sep=sep, na=na, quote=quote)
 # }

 # write fasta files
 # Write a table of sequences to a FASTA file.
 #   id_seq_table : data frame or matrix; column 1 holds the record ids and
 #                  column 2 the sequences
 #   line_width   : number of bases per sequence line
 #   filename     : output path; the file is created (or truncated) first
 # Only runs of uppercase G/T/A/C are wrapped (see the regular expression below)
 # and every record is followed by an empty line.
 # Returns TRUE on success, or FALSE (with a message) when the file cannot be created.
 util$write_fasta <- function(id_seq_table, line_width=60, filename){
  # file.create() signals failure through its return value; testing it here keeps
  # the "unable to create" branch reachable instead of erroring later inside cat()
  if (!isTRUE(file.create(filename))) {
     message(sprintf("Unable to create the file %s, please check that the path exists and is accessible", filename))
     return(FALSE)
  }
  entries <- as.data.frame(id_seq_table, stringsAsFactors = FALSE)
  # apply() calls its function once even for a zero-row table, which used to write
  # a bogus empty record; columns are also read directly so numeric ids are not
  # space-padded by the matrix coercion apply() performs
  if (nrow(entries) > 0) {
    ids <- as.character(entries[[1]])
    seqs <- as.character(entries[[2]])
    wrap_regex <- sprintf("([GTAC]{%i})", line_width)
    records <- sprintf(">%s\n%s\n\n", ids, gsub(wrap_regex, "\\1\n", seqs))
    cat(records, file = filename, sep = "", append = TRUE)
  }
  TRUE
 }
     
 # improved implementation of xlsx::write.xlsx (allows overwrite of sheets and return the data as tibble, allowing it to be part of tidyverse pipes)
 # Write a data frame to one sheet of an Excel workbook using openxlsx. A new
 # workbook is created when `excel_file` does not exist; otherwise the sheet is
 # added to (or replaced in) the existing workbook.
 #   df             : data to write (coerced with as.data.frame)
 #   excel_file     : path of the workbook
 #   sheet          : name of the sheet to write
 #   overwritesheet : replace `sheet` when it already exists (otherwise stop)
 #   updatefile     : allow modifying an existing workbook (otherwise stop)
 #   asTable        : write an Excel table (writeDataTable) instead of plain cells
 #   return.tibble  : return the data as a tibble when 'tibble' is installed
 #   ...            : forwarded to openxlsx::write.xlsx, only when a new file is created
 # Returns the written data invisibly, so the call can be part of a pipe.
 util$write_xlsx <- function(df, excel_file, sheet="Sheet1", overwritesheet=FALSE, updatefile = TRUE, 
                             asTable=TRUE, return.tibble=TRUE,...){ 
  # requireNamespace() checks a single package; installed.packages() scans every
  # library on each call
  if (!requireNamespace("openxlsx", quietly = TRUE)) stop("Package 'openxlsx' is required and not available, please install it and try again")
  df <- as.data.frame(df)
  # Shared exit: optionally convert to tibble, always return invisibly
  finish <- function(out) {
    if (isTRUE(return.tibble) && requireNamespace("tibble", quietly = TRUE)) out <- tibble::as_tibble(out)
    invisible(out)
  }
  if (!file.exists(excel_file)) {
    openxlsx::write.xlsx(df, excel_file, sheetName = sheet, rowNames=FALSE, asTable=asTable, ...)
    return(finish(df))
  }
  # From here on the workbook is known to exist
  if (!isTRUE(updatefile)) stop(sprintf("File %s exists and 'updatefile' set to FALSE.", excel_file)) 
  wb <- openxlsx::loadWorkbook(excel_file)  
  sheets <- openxlsx::getSheetNames(excel_file)
  if (sheet %in% sheets){
    if (overwritesheet==FALSE) stop(sprintf("Sheet %s exists in file and 'overwritesheet' set to FALSE, please try again with a new sheet name or with 'overwritesheet=TRUE' option", sheet))
    openxlsx::removeWorksheet(wb, sheet)
  }
  openxlsx::addWorksheet(wb, sheet)
  if (asTable==TRUE){
   openxlsx::writeDataTable(wb, sheet, df)
  } else {
   openxlsx::writeData(wb, sheet, df)
  }
  openxlsx::saveWorkbook(wb, excel_file, overwrite = updatefile)
  finish(df)
 }

 # extract hyperlinks from excel sheets (see https://stackoverflow.com/a/70676505/5346827)
 # Extract the hyperlinks stored in the sheets of an .xlsx file by unzipping the
 # workbook and reading its worksheet XML and relationship (.rels) files.
 #   aExcelFile     : path to the workbook
 #   ExcelSheet     : "all" (default), sheet names, or sheet indices
 #   aRefOutputFile : optional path; when given the result is also written as CSV
 # Returns a tibble with columns id, tab_idx, sheet, ref, target and text
 # (zero rows when the selected sheets contain no hyperlinks).
 # NOTE(review): sheet index N is assumed to correspond to 'sheetN.xml' inside the
 # archive - confirm for workbooks whose sheets were reordered.
 util$hyperlinks_from_excel <- function(aExcelFile, ExcelSheet="all", 
                                  aRefOutputFile = NULL){
  req_pkgs <- c("XML", "purrr", "dplyr", "readxl")
  if (!requireNamespace("rlang", quietly = TRUE)) stop("'rlang' package is not installed, please install it and try again")
  rlang::check_installed(req_pkgs, 
                         reason = "Package XML is required to extract the urls from the sheets, while purrr and dplyr are needed to process the results")
  # Bind the pipe locally so the function does not rely on dplyr/magrittr being attached
  `%>%` <- dplyr::`%>%`
  sheets <- readxl::excel_sheets(aExcelFile)
  
  # Read the hyperlinks of one sheet; returns NULL when the sheet has none
  read_relationships <- function(aSheetIndex, tmp_dir){
    filename <- file.path(tmp_dir, 'xl', 'worksheets', '_rels', paste0('sheet', aSheetIndex, '.xml.rels'))    
    # The relationship file may be absent for this sheet; treat that as "no links"
    if (!file.exists(filename)) {
      return(NULL)
    }
    rel <- XML::xmlParse(filename) %>% XML::xmlToList() %>% purrr::map_dfr(as.list) %>% 
      dplyr::filter(grepl("hyperlink$", Type)) %>% 
      dplyr::select(id=Id, target=Target)
      
    if(nrow(rel) == 0){
      return(NULL)
    }
    
    filename <- file.path(tmp_dir, 'xl', 'worksheets', paste0('sheet', aSheetIndex, '.xml'))    
    pos <- XML::xmlParse(filename) %>% XML::xmlToList() 
    
    if(is.null(pos$hyperlinks)){
      return(NULL)
    }
    
    ret <- purrr::map_dfr(pos$hyperlinks, as.list) %>% 
      dplyr::inner_join(rel, by = 'id') %>% 
      dplyr::mutate(tab_idx=aSheetIndex)
    
    link_text <- ret %>% dplyr::select(text=display, target) %>% 
      dplyr::filter(!is.na(text)) %>% 
      dplyr::distinct()
    
    # 'target' is the only column shared by the two tables
    return(dplyr::left_join(ret, link_text, by = "target"))
  }
  
  EXCEL_TEMP_NAME <- 'unzipped_excel'
  
  tmp_base_dir <- file.path(tempdir(),
                            paste0('tmpexcl', 
                                   as.character(round(runif(1, 1000000000000, 9999999999999)))))
  dir.create(tmp_base_dir)
  # unlink() only removes a directory when recursive = TRUE
  on.exit(unlink(tmp_base_dir, recursive = TRUE), add = TRUE)
  
  zipfile <- file.path(tmp_base_dir, paste0(EXCEL_TEMP_NAME, '.zip'))
  file.copy(from = aExcelFile, to = zipfile)
  unzip(zipfile, exdir = tmp_base_dir)
  
  # Resolve the requested sheets to indices; scalar && keeps vector inputs out of `if`
  sheet_idx <- NULL
  if (is.numeric(ExcelSheet) && all(ExcelSheet %in% seq_along(sheets))) {
    sheet_idx <- ExcelSheet
  }  else if (is.character(ExcelSheet) && all(ExcelSheet %in% sheets)) {
    sheet_idx <- which(sheets %in% ExcelSheet)
  } else if (is.character(ExcelSheet) && length(ExcelSheet) == 1L && tolower(ExcelSheet)=="all") {
    sheet_idx <- seq_along(sheets)
  }
 
  if (length(sheet_idx)==0) stop("Sheets provided to 'ExcelSheet' argument not present in the Excel file")
    
  links <- purrr::map_dfr(sheet_idx, ~read_relationships(.x, tmp_base_dir))
  if (nrow(links) == 0) {
    # No hyperlinks anywhere: return an empty table with the usual columns
    res <- dplyr::tibble(id = character(0), tab_idx = integer(0), sheet = character(0),
                         ref = character(0), target = character(0), text = character(0))
  } else {
    res <- links %>% 
      dplyr::mutate(sheet = sheets[tab_idx]) %>% 
      dplyr::mutate(ref = paste0("'", sheet, "'!", ref)) %>% 
      dplyr::select(id,tab_idx, sheet, ref, target, text)
  }
  
  if(!is.null(aRefOutputFile)){
    rlang::check_installed("readr", reason = "to write the extracted links to a csv file")
    readr::write_csv(res, aRefOutputFile)
  }
  
  return(res)
 }


 ########################################
 ##  Misc small routines
  # calculates standard error
 # Standard error of the mean: sqrt(variance / number of observations).
 # With na.rm = TRUE missing values are excluded from both the variance and the count.
 util$se <- function(x, na.rm=TRUE) {
   if (na.rm) {
     n_observed <- sum(!is.na(x))
     return(sqrt(var(x, na.rm=TRUE) / n_observed))
   }
   sqrt(var(x) / length(x))
 } 
 # calculate geometric mean (see https://stackoverflow.com/a/25555105/5346827)
 # Geometric mean of `x`.
 # Only strictly positive values contribute to the sum of logs; other non-missing
 # values still count towards the denominator.
 #   na.rm : drop missing values; they are then excluded from the denominator too
 #           (previously they were dropped from the sum but still counted)
 util$gm_mean <- function(x, na.rm=TRUE){
  n_values <- if (na.rm) sum(!is.na(x)) else length(x)
  exp(sum(log(x[x > 0]), na.rm=na.rm) / n_values)
 }
        
 # Short alias for as.character()
 util$as.c <- as.character
 # split vector into chunks (see https://stackoverflow.com/questions/3318333/split-a-vector-into-chunks)
 # Split `x` into consecutive chunks of at most `n` elements.
 # Returns a list of chunks (the last one may be shorter); an empty `x` gives an
 # empty list instead of the seq.int() error it used to trigger.
 util$chunk <- function(x, n) {
   if (length(x) == 0L) return(list())
   starts <- seq.int(from=1, to=length(x), by=n)
   ends <- pmin(starts+(n-1), length(x))
   mapply(function(a, b) (x[a:b]), starts, ends, SIMPLIFY=FALSE)
 }
 # Split `x` into `n` groups of roughly equal size (groups are built with cut()
 # on the element positions); returns a list named by group number.
 util$chunk2 <- function(x,n) {
   group_id <- cut(seq_along(x), n, labels = FALSE)
   split(x, group_id)
 }
                                     
 # Inverse of which(): turn a vector of indices into a logical mask of length `len`.
 util$unwhich <- function(indices, len=length(indices)) {
  mask <- logical(len)
  mask[indices] <- TRUE
  mask
 }

 # "not NA": shorthand for !is.na(...)
 util$nna <- function(...) {
   is_missing <- is.na(...)
   !is_missing
 }

 # S3 generic: return the elements (or rows) of an object in random order
 util$shuffle <- function(...) UseMethod("shuffle")

 # Default method: permute a vector by ordering uniform random draws
 util$shuffle.default <- function(x) {
   random_keys <- runif(length(x))
   x[order(random_keys)]
 }

 # Data frame method: permute the rows
 util$shuffle.data.frame <- function(x) {
   random_keys <- runif(nrow(x))
   x[order(random_keys), ]
 }

 # Draw a random sample of rows from a data frame.
 #   d    : data frame
 #   size : number of rows to draw
 #   ...  : further arguments for sample() (e.g. replace = TRUE)
 # Returns the sampled rows of `d`.
 util$sample_df <- function(d, size=10, ...)  {
  # seq_len() is empty for a zero-row frame, whereas 1:nrow(d) would yield c(1, 0)
  # and silently produce a row of NAs
  samp <- sample(seq_len(nrow(d)), size=size, ...)
  d[samp,]
 }

 # split a dataframe into subsets by size
 # Split a data frame into consecutive subsets of `group_size` rows (the last
 # subset may be smaller); returns a list named by group number.
 util$split_df <- function(df, group_size){
  n_rows <- nrow(df)
  n_groups <- ceiling(n_rows/group_size)
  group_id <- gl(n_groups, group_size, n_rows)
  split(df, group_id)
 }

 # Levels of a factor that actually occur in its values, in level order
 util$present_levels <- function(x) {
   declared <- levels(x)
   intersect(declared, x)
 }

 # S3 generic: drop factor levels that do not occur in the data
 util$trim_levels <- function(...) UseMethod("trim_levels")

 # Factor method: rebuild the factor with only the levels that are present
 util$trim_levels.factor <- function(x) {
   kept_levels <- present_levels(x)
   factor(x, levels=kept_levels)
 }

 # Data frame method: trim every factor column, leave other columns untouched
 util$trim_levels.data.frame <- function(x) {
  for (col_name in names(x)) {
    if (is.factor(x[,col_name])) {
      x[,col_name] <- trim_levels(x[,col_name])
    }
  }
  x
 }

 # Convert every factor column of a data frame to character.
 # Returns the data frame with all other columns unchanged.
 util$unfactor <- function(df){
  # vapply() always yields a logical vector; sapply() returns list() for a
  # zero-column data frame, which is not a valid column subscript
  is_factor_col <- vapply(df, is.factor, logical(1))
  if (any(is_factor_col)) {
    df[is_factor_col] <- lapply(df[is_factor_col], as.character)
  }
  df
 }
        
 # grep() returns indices of matches.  Variants:

 util$bgrep <- function(pat,x, ...) {
  # "boolean" grep: a logical vector (one entry per element of x) marking the
  # matches, ready for vector operations such as & and |
  matched <- logical(length(x))
  matched[grep(pat,x,...)] <- TRUE
  matched
 }

 util$ngrep <- function(pat,x, ...) {
  # "normal" grep: return the matching values themselves rather than their indices
  hits <- grep(pat,x,...)
  x[hits]
 }


 ########################################
 ##  Other data manipulation routines
 # Transpose a data frame: the original column names become the first column and
 # the values of the first original column become the new column names.
 util$tidy_transpose <- function(df) {   
   # t() works on matrices and returns a matrix, so convert back to a data frame
   transposed <- as.data.frame(t(df))
   # Move the row names (the original column names) into a regular column
   with_names <- tibble::rownames_to_column(transposed, var = "rowname")
   # Promote the first row to column headers
   janitor::row_to_names(with_names, row_number = 1)
 }                                     
  
 util$tapply2 <- function(x, ...) {
  # tapply() variant that keeps factors: for a factor `x` the function is applied
  # to the character values and the result is rebuilt with the levels of `x`
  if (!is.factor(x)) {
    return(tapply(x, ...))
  }
  summarised <- tapply(as.character(x), ...)
  factor(summarised, levels=levels(x))
 }

 util$inject <- function(collection, start, fn) {
  # Left fold (lisp "reduce", ruby "inject"): start from `start` and combine it
  # with each element of `collection` in turn using fn(accumulator, element)
  result <- start
  for (item in collection) {
    result <- fn(result, item)
  }
  result
 }


 ##### Bioinformatics Annotation #######
  # Shortens and "prettify" long BLAST descriptions  
 # Shorten and tidy long description strings (e.g. BLAST hit titles).
 #   term         : character vector of descriptions
 #   prefix       : prefixes to remove (case-insensitive, first occurrence)
 #   comma, dot   : keep only the text before the first comma / dot
 #   paren        : remove a parenthesised part (including the preceding space)
 #   cap          : capitalise the first character
 #   firstCapOnly : lower-case the first letter of every word but the first
 #   spDot        : in "sp. <a>.<b>" replace the second dot with an underscore
 #   charNum      : when > 0, cut after the word that reaches this many characters
 #   showTrimmed  : marker appended to terms that ended up shorter than the input;
 #                  NULL or FALSE appends nothing
 # Returns a character vector with one cleaned term per input element.
 util$pretty_term <- function(term, prefix=c("PRED:", "Predicted:"), comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, firstCapOnly=FALSE, 
                             spDot=TRUE, charNum=0, showTrimmed=NULL) {
  # FALSE used to be pasted as the literal text "FALSE"; treat it as "no marker"
  trim_marker <- if (is.null(showTrimmed) || identical(showTrimmed, FALSE)) NULL else as.character(showTrimmed)
  prefix_search_regex <- paste(paste0(prefix, "\\s*"), collapse = "|")
  prettify_one <- function (s) {
    s_ <- s
    if (is.character(prefix_search_regex) && length(prefix_search_regex)>0) s_ <- sub(prefix_search_regex ,"", s_, ignore.case = TRUE, perl=TRUE)
    if (cap) s_ <- sub("(^.)", "\\U\\1", s_, perl=TRUE)
    if (firstCapOnly) s_ <- paste(c(unlist(strsplit(s_, " ", fixed = TRUE))[1], sapply(unlist(strsplit(s_, " ", fixed = TRUE))[-1], function(n) sub("(^.)", "\\L\\1", n, perl=TRUE), USE.NAMES = FALSE)), collapse = " ")
    if (spDot) s_ <- sub("sp\\. (.+?)\\.(.+)", "sp. \\1_\\2", s_, perl=TRUE)
    if (paren) s_ <- sub(" \\(.+\\)", "",  s_)
    if (dot) s_ <- unlist(strsplit(s_, ".", fixed = TRUE))[1]
    if (comma) s_ <- unlist(strsplit(s_, ",", fixed = TRUE))[1]
    # Trim to the end of the word if character number limit is reached and remove any trailing commas or dots
    if (charNum>0) s_ <- sub("[,|\\.]$", "", sub(sprintf("(^.{%s}[\\S]*)[ ]*.*", charNum), "\\1", s_, perl=TRUE), perl = TRUE)
    # Append the marker when the term was shortened (unless it already ends in "*");
    # isTRUE() keeps missing terms from breaking the condition
    was_trimmed <- isTRUE(nchar(s_)<nchar(s) && !grepl("\\*$", s_))
    if (was_trimmed && !is.null(trim_marker)) return(paste0(s_, trim_marker))
    s_
  }
  # vapply() guarantees a character vector, also for empty input
  vapply(term, prettify_one, character(1), USE.NAMES = FALSE)
 }
  
 # Find unique Tax_name
 # Pick a single taxon name for one BLAST hit.
 #   desc     : hit description; the taxon is read from a trailing "[...]" part
 #   taxName  : taxon name(s) reported for the hit, ";"-separated when several
 #   prettify : clean the result with pretty_term() (removes "Undetermined")
 # When taxName is NA or "N/A" the name comes from the description (NA if it has
 # no "[...]" part). When taxName lists several names, the one named in the
 # description is used if it is in the list, otherwise the last listed name.
 # Both arguments are expected to be single values.
 util$find_uniq_tax <- function(desc, taxName, prettify=FALSE) {
  if (is.na(taxName) || taxName=="N/A") {
    has_bracket_taxon <- !is.na(desc) && grepl(".+\\[(.*)\\]", desc)
    uniqTax <- if (has_bracket_taxon) sub(".+\\[(.*)\\]","\\1", desc) else NA
  } else if (grepl(";", taxName)) {
    taxList <- unlist(strsplit(taxName, ";"))
    taxDesc <- sub(".+\\[(.*)\\]","\\1", desc)
    uniqTax <- if (taxDesc %in% taxList) taxDesc else taxList[length(taxList)]
  } else {
    uniqTax <- taxName
  }
  # showTrimmed must be NULL here: any other value makes pretty_term() append it
  # to shortened names (FALSE was pasted as the text "FALSE")
  if (prettify) uniqTax <- pretty_term(uniqTax, prefix="Undetermined", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, spDot=TRUE, charNum=0, showTrimmed=NULL)
  return(uniqTax)
 } 

 # Resolve scientific name (with taxize package)
 # Resolve taxon names with taxize::gnr_resolve(), one name at a time, reporting
 # progress with message(). Returns a list with one element per input name: the
 # best-scoring result row, or a one-row data frame of NAs when the lookup failed.
 util$resolve_scientific <- function(tax_names) {
  n_names <- length(tax_names)
  resolve_one <- function(position) {
    query <- tax_names[[position]]
    message(sprintf("Processing %s (%.1f%% completed)", query, position/n_names*100))
    res_name <- try(taxize::gnr_resolve(names = query, data_source_ids=4, canonical = TRUE,best_match_only=FALSE ), TRUE)
    if (inherits(res_name, "try-error")) {
      message(sprintf("Warning: Could not process %s correctly, returning NAs", query))
      return(data.frame(submitted_name=query,matched_name=NA,data_source_title=NA,score=NA))
    }
    res_name$results[which.max(res_name$results$score),]
  }
  resolved <- lapply(seq_along(tax_names), resolve_one)
  # keep the names of the input on the result
  names(resolved) <- names(tax_names)
  resolved
 }

 # Define a function that accepts a matrix of DE contrasts and DESeq data to analyse each contrast
 # Extract the DESeq2 results of one contrast as a tidy table.
 #   dds_deseq2    : DESeq2 object passed to DESeq2::results()
 #   mat_col       : the two group levels to compare; they are used in reverse
 #                   order, i.e. contrast = c(group_col, mat_col[2], mat_col[1])
 #   group_col     : name of the design column that holds the groups
 #   analysis_date : value stored in the 'analysis_date' column. Defaults to a
 #                   global variable of that name when one exists (the previous,
 #                   implicit behaviour), otherwise to today's date
 # Returns a tibble without rows lacking an adjusted p-value, sorted by padj and
 # then by decreasing absolute log2 fold change.
 util$get_deseq2_res <- function(dds_deseq2, mat_col, group_col = "sample_group",
                                 analysis_date = get0("analysis_date", envir = globalenv(),
                                                      ifnotfound = Sys.Date())) {
  contrast <- c(group_col,rev(mat_col))
  comparison <- paste0(rev(mat_col), collapse="_vs_")
  run_date <- analysis_date
  res <- DESeq2::results(dds_deseq2, contrast=contrast, tidy = TRUE)
  res <- dplyr::as_tibble(res)
  res <- dplyr::rename(res, gene_id=row)
  res <- dplyr::mutate(res, contrast=comparison, analysis_date=run_date)
  res <- dplyr::filter(res, !is.na(padj))
  res <- dplyr::arrange(res, padj, dplyr::desc(abs(log2FoldChange)))
  return(res)
 }

 # Modify plotPCA to return all 4 PCs

 # PCA of the `ntop` most variable rows of a DESeq2-style object, giving access to
 # the first four principal components.
 #   dds_deseq2 : object providing assay() and colData()
 #   intgroup   : colData column(s) used to group/colour the samples
 #   ntop       : number of most variable rows used for the PCA
 #   PCs        : the two components (between 1 and 4) to plot on x and y
 #   returnData : return the PC1-PC4 data frame (with a "percentVar" attribute)
 #                instead of a plot
 # Returns a ggplot object, or the data frame when returnData = TRUE.
 util$plotPCA4 <- function (dds_deseq2, intgroup = "condition", ntop = 500, PCs=3:4, returnData = FALSE){
  rv <- genefilter::rowVars(SummarizedExperiment::assay(dds_deseq2))
  select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
                                                     length(rv)))]
  pca <- prcomp(t(SummarizedExperiment::assay(dds_deseq2)[select, ]))
  # PC1-PC4 are extracted below, so fail early with a clear message
  if (ncol(pca$x) < 4) {
    stop("at least 4 principal components are needed (too few samples or features)")
  }
  percentVar <- pca$sdev^2/sum(pca$sdev^2)
  if (!all(intgroup %in% names(SummarizedExperiment::colData(dds_deseq2)))) {
    stop("the argument 'intgroup' should specify columns of colData(dds)")
  }
  intgroup.df <- as.data.frame(SummarizedExperiment::colData(dds_deseq2)[, intgroup,
                                               drop = FALSE])
  group <- if (length(intgroup) > 1) {
    factor(apply(intgroup.df, 1, paste, collapse = " : "))
  }
  else {
    SummarizedExperiment::colData(dds_deseq2)[[intgroup]]
  }
  d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group,
                  intgroup.df, name = colnames(dds_deseq2))
  if (returnData) {
    attr(d, "percentVar") <- percentVar[1:4]
    return(d)
  }
  if (length(PCs) < 2 || !all(PCs[1:2] %in% 1:4)) {
    stop("'PCs' must name two components between 1 and 4")
  }
  # aes_string() is namespaced like every other ggplot2 call, so ggplot2 does not
  # have to be attached
  ggplot2::ggplot(data = d, ggplot2::aes_string(x = paste0("PC", PCs[1]), y = paste0("PC", PCs[2]), color = "group")) + 
   ggplot2::geom_point(size = 3) +
    ggplot2::labs(x = sprintf("PC%d: %.2f%% variance", PCs[1], percentVar[PCs[1]]*100), 
         y = sprintf("PC%d: %.2f%% variance", PCs[2], percentVar[PCs[2]]*100)) + ggplot2::coord_fixed()
 }

 # blast headers (see https://www.metagenomics.wiki/tools/blast/blastn-output-format-6)
 # Column names of the standard 12-column BLAST tabular output (outfmt 6)
 util$blast_std_headers <- c("qseqid", "sseqid", "pident", "length",
                             "mismatch", "gapopen", "qstart", "qend",
                             "sstart", "send", "evalue", "bitscore")
 # BLAST extended headers: standard columns plus hit title and taxonomy
 util$blast_tax_headers <- c(util$blast_std_headers,
                             'stitle', 'staxids', 'ssciname', 'scomname')
 # Second extended header set ("dmnd"): standard columns plus title and taxonomy columns
 util$dmnd_tax_headers <- c(util$blast_std_headers,
                            "stitle", "staxids", "sscinames", "skingdoms", "sphylums")
 # Column names for the InterPro annotation table
 util$interpro_headers <- c("protein", "md5", "length", "analysis", "accession",
                            "description", "start", "stop", "score", "status",
                            "date", "interpro_accession", "interpro_description",
                            "go_annotation", "pathway_annotation")
                                    
 # filter vcf by sample
 # Subset a vcfR object by sample name. Exactly one of `include_samples` (samples
 # to keep) or `exclude_samples` (samples to drop) must be supplied.
 # Returns the object restricted to the FORMAT column plus the kept samples.
 util$filter_vcfr_samples <- function(vcf, include_samples, exclude_samples) {
  require(vcfR)
  rlang::check_exclusive(include_samples, exclude_samples)
  # the first column of the genotype matrix is dropped; the rest are sample names
  all_samples <- colnames(vcf@gt)[-1]
  if (missing(include_samples)) {
    is_excluded <- all_samples %in% exclude_samples
    include_samples <- all_samples[!is_excluded]
  }
  kept_columns <- c('FORMAT', include_samples)
  vcf[,kept_columns]
 }
 # filter vcfR samples by pattern
 # Subset a vcfR object to the samples whose names match `pattern`
 # (or do not match it when invert = TRUE). The FORMAT column is always kept.
 util$grep_vcfr_samples <- function(vcf, pattern, invert = FALSE) {
  require(vcfR)
  all_samples <- colnames(vcf@gt)[-1]
  selected <- grep(pattern, x = all_samples, value = TRUE, invert = invert)
  vcf[,c('FORMAT', selected)]
 }
 ########################################
 ## Printing, viewing
 ## see also:  str()
        
 ## Transparent colors
 ## Mark Gardener 2015
 ## www.dataanalytics.org.uk

 # Make transparent versions of colours.
 #   color   : colour name(s) or any colour specification col2rgb() accepts
 #   percent : transparency in percent (0 = opaque, 100 = fully transparent)
 #   name    : optional name(s) for the resulting colour(s)
 # Returns (invisibly) the "#RRGGBBAA" string for every input colour; previously
 # only the first colour of a vector was converted.
 util$t_col <- function(color, percent = 50, name = NULL) {
  ## RGB values: one column per input colour
  rgb.val <- col2rgb(color)
  ## New colours with the alpha channel set from the requested transparency
  t.col <- rgb(rgb.val[1, ], rgb.val[2, ], rgb.val[3, ],
               maxColorValue = 255,
               alpha = (100-percent)*255/100,
               names = name)
  invisible(t.col)
  }

 # C-style printf: format with sprintf() and print the result with cat()
 util$printf <- function(...) {
   formatted <- sprintf(...)
   cat(formatted)
 }

 # Print a named list (or vector) on one line as space-separated "name=value" pairs
 util$listprint <- function(x) {
  pairs <- sapply(names(x), function(key) sprintf("%s=%s", key, x[[key]]))
  line <- paste(pairs, collapse=' ')
  cat(sprintf("%s\n", line))
 }

 # Write the arguments, followed by a newline, to stderr
 util$msg <- function(...) {
   cat(..., "\n", file=stderr())
 }
   
 # Print a log message, by default prefixed with a time stamp.
 #   text       : message; used as a sprintf() format when a time stamp is printed
 #   dateformat : format for the time stamp; NULL or FALSE prints `text` as is
 #   ...        : values for the sprintf() placeholders in `text`
 # Returns NULL invisibly (the value of cat()).
 util$LogMsg <- function(text, dateformat="%d-%m-%Y %I:%M:%S",  ...) {
   # `is.null(x) | (x == FALSE)` evaluates to logical(0) for NULL and made `if`
   # fail; short-circuit so a NULL format is handled
   if (is.null(dateformat) || isTRUE(dateformat==FALSE)) return(cat(paste0(text, "\n")))
  msg <- sprintf(paste0(format(Sys.time(), dateformat), ": ", text, "\n"), ...)
  cat(msg)
 }

 # Short alias for utils::head()
 util$h = utils::head

 util$ppy <- function(x, column.major=FALSE, ...) {
  # Pretty-print an object as YAML (similar to mysql's \G); intended for rows
  # with big textual cells. `...` is passed on to cat().
  library(yaml)
  yaml_text <- as.yaml(x, column.major=column.major)
  cat(yaml_text, ...)
  cat("\n", ...)
 }

 # Print parallel vectors as the columns of an HTML table (intended for use
 # inside dosink()).
 #   ... : the columns; they are assumed to have the same length, and the length
 #         of the first one gives the number of rows
 # The markup is written with cat(); rows and cells are opened but not closed,
 # exactly as before.
 util$table_html <- function(...) {
  emit <- function(...) cat(sprintf(...))
  columns <- list(...)
  n_col <- length(columns)
  n_row <- if (n_col > 0) length(columns[[1]]) else 0L
  emit("\n<table cellpadding=3 border=1 cellspacing=0 bordercolor=gray>")
  # seq_len() yields no iterations for empty input, whereas 1:0 would print
  # rows of NA cells
  for (i in seq_len(n_row)) {
    emit("\n<tr>")
    for (j in seq_len(n_col))
      emit("\n  <td>%s", columns[[j]][i])
  }
  emit("\n</table>\n")
 }


 ########################################
 ##  Workspace management
 # Fix devtools
 # Prepare the session for devtools-based installs: make sure httr and devtools
 # are installed and loaded (via install.deps), then relax two checks.
 util$.fixdevtools <- function() {
   dev_reqs <- c("httr", "devtools")
   install.deps(dev_reqs)
   # NOTE(review): ssl_verifypeer = 0L turns off SSL peer verification in httr's
   # configuration; nothing here restores it afterwards - confirm this is intended
   httr::set_config( httr::config( ssl_verifypeer = 0L ) ) 
   # Install a build-tools check that always reports TRUE
   options(buildtools.check = function(action) TRUE )
   }
 # Silently load a package, can change mode to "require"
 # Load a package while suppressing warnings and startup messages.
 #   a.package : package name as a string
 #   mode      : "library" (error if the package is missing) or "require"
 #               (returns FALSE if it is missing)
 # Returns the value of the loader call.
 util$silent.load <- function(a.package, mode="library"){
   # Call the loader directly rather than through eval(parse()), which would
   # execute any code placed in `mode`
   mode <- match.arg(mode, c("library", "require"))
   loader <- switch(mode, library = base::library, require = base::require)
   suppressWarnings(suppressPackageStartupMessages(
     loader(a.package, character.only=TRUE)))
 }
           
 # install and load packages from CRAN, Bioconductor or git, see usage below
 # Install (when missing, or when force = TRUE) and then load packages.
 #   p        : character vector of package names; for repo = "git" entries look
 #              like "user/package" and the part after the last "/" is the package name
 #   repo     : "cran", "bioc" or "git" - selects the install call built below
 #   reposite : CRAN mirror; falls back to cloud.r-project.org when NULL
 #   silent   : load with silent.load() instead of require()
 #   force    : reinstall even when the package is already installed
 # In a non-interactive session a failed installation quits R with status 2;
 # interactively it raises an error.
 # NOTE(review): biocLite() looks like the legacy Bioconductor installer - confirm
 # it is still available before relying on repo = "bioc".
 util$install.deps <- function(p, repo="cran", reposite=getOption("repos"), silent=FALSE, force=FALSE){
  # repos=getOption("repos")
  if (is.null(reposite)) reposite <- "http://cloud.r-project.org/"
  
  # Text of the install call, left without its closing parenthesis; it is
  # completed and evaluated inside the loop below
  call_install <- switch(repo,
              cran=sprintf("install.packages(package, repos=\"%s\"", reposite),
                         bioc="biocLite(package, suppressUpdates=TRUE, ask=FALSE",
                         git="install_github(package")
  
  if (repo=="git") .fixdevtools()
  if (repo=="bioc" & !all(p %in% utils::installed.packages())) {
    #source("http://bioconductor.org/biocLite.R")
    
    if (!"RCurl" %in% utils::installed.packages()) {
      install.packages("RCurl", repos=reposite)
      
    }
    
    # NOTE(review): this downloads an R script over plain http with peer
    # verification disabled and evaluates it in this session
    eval(parse(text = RCurl::getURL("http://bioconductor.org/biocLite.R",
                             ssl.verifypeer=FALSE)))
    biocLite(suppressUpdates=TRUE, ask=FALSE)
  }
  for (package in p) {
    # For "user/package" entries keep only the package part
    gitNames <- unlist(strsplit(package, "/"))
    pName <- gitNames[length(gitNames)]
    if (!pName %in% utils::installed.packages() | isTRUE(force)) {
      cat(sprintf("Please wait, installing and loading missing package %s and its dependencies from %s.\n",
                  package, repo), file=stderr())
      
      # Complete the call text prepared above and run it
      suppressWarnings(eval(parse(text = sprintf("%s, quiet = TRUE, lib=.libPaths()[1])",call_install))))
      if (!pName %in% utils::installed.packages()) {
        ifelse(!interactive(),q("no", 2, TRUE),
               stop(sprintf("Unable to install missing package %s from %s.\n",
                            package, repo), call. = FALSE))
      }
    }
    if (isTRUE(silent)) silent.load(pName, mode="require")  
      else require(pName, character.only=T, quietly=TRUE, warn.conflicts=TRUE)
  }
 }

 # CRAN_packages <- c("RCurl", "dplyr", "tidyr", "ggplot2", "RColorBrewer", "RSQLite")
  # install.deps(CRAN_packages)

  
 # improved list of objects
 # http://stackoverflow.com/questions/1358003/tricks-to-manage-the-available-memory-in-an-r-session
 # Overview of the objects in an environment.
 #   pos     : where to look (search-path position or environment), as in ls()/get()
 #   pattern : optional regular expression restricting the object names
 # Returns a data frame with one row per object (row names = object names) and the
 # columns Type (first class), Size (human readable, only shown for objects of at
 # least 1e6 bytes), Dim (length or "(rows, cols)") and Value (a short preview).
 # Functions are listed last. Uses base R only.
 util$list_objects <- function (pos = 1, pattern) {
    napply <- function(obj_names, fn) sapply(obj_names, function(nm)
                                         fn(get(nm, pos = pos)))
    obj_names <- if (missing(pattern)) ls(pos = pos) else ls(pos = pos, pattern = pattern)
    if (length(obj_names) == 0L) {
      # Nothing to describe: return an empty table with the usual columns
      return(data.frame(Type = character(0), Size = character(0), Dim = character(0),
                        Value = character(0), stringsAsFactors = FALSE))
    }
    obj_class <- napply(obj_names, function(x) as.character(class(x))[1])
    obj_mode <- napply(obj_names, mode)
    obj_type <- ifelse(is.na(obj_class), obj_mode, obj_class)
    obj_prettysize <- napply(obj_names, function(x) {
                           capture.output(print(object.size(x), units = "auto")) })
    obj_size <- napply(obj_names, object.size)
    obj_prettysize[obj_size < 1e6] <- ""

    obj_length <- napply(obj_names, function(x) length(x))
    obj_dim <- t(napply(obj_names, function(x)
                        as.numeric(dim(x))[1:2]))

    is_flat <- is.na(obj_dim)[, 1]
    # One TRUE/FALSE per object: comparing class(x) with `!=` returned several
    # values for objects that have more than one class
    is_vector <- napply(obj_names, function(x) is.vector(x) && !is.list(x))

    info_width <- max(20, getOption("width") - 60)

    # Shorten a string to the preview width, marking the cut with ".."
    cutoff <- function(s) {
      if (nchar(s) >= info_width) {
        paste0(substr(s, 1, info_width-2), '..')
      } else {
        s
      }
    }

    # One-line preview of an object based on str()
    small_str <- function(x) {
      out <- capture.output(
        str(x, max.level=0, give.attr=FALSE, give.head=FALSE, width=info_width, strict.width='cut')
      )
      out <- paste(out, collapse=' ')
      out <- cutoff(sub("\n", " ", out, fixed = TRUE))
      if (grepl("^List of", out))
        out <- paste0("[Names] $ ", paste(names(x), collapse=' '))
      cutoff(out)
    }

    out <- data.frame(
      Type = obj_type,
      Size = obj_prettysize,
      Dim = ifelse(is_vector | is_flat, obj_length, 
        sprintf("(%s, %s)", obj_dim[,1], obj_dim[,2])),
      Value = napply(obj_names, function(x) 
        # inherits() gives a single TRUE/FALSE for multi-class objects such as tibbles
        if (inherits(x, c('data.frame','list')) && !is.null(names(x)))
          cutoff(paste0("[Names] $ ", paste(names(x), collapse=' ')))
        else small_str(x)
        ),
      stringsAsFactors=FALSE)
    row.names(out) <- obj_names
    out$Dim <- sprintf(" %s", out$Dim)
    out$Value <- sprintf(paste0(" %-", info_width, "s"), out$Value)

    # Non-functions first, functions last
    is_fun <- out$Type == 'function'
    rbind(out[!is_fun, , drop = FALSE], out[is_fun, , drop = FALSE])
 }

 # list_objects() for the default position, without the row for 'util' itself
 util$lsos <- function() {
  listing <- list_objects()
  listing[row.names(listing) != 'util', , drop = FALSE]
 }


 ########################################
 ## For performance optimization and long-running jobs

 util$timeit <- function(expr, name=NULL) {
  # Evaluate `expr`, print how long it took (prefixed with `name` when given) and
  # return the value of `expr` invisibly, so timeit({ ... }) can be wrapped
  # around any chunk of code.
  started <- Sys.time()
  value <- eval(expr)
  elapsed <- Sys.time() - started
  if (!is.null(name)) {
    cat(name,": ")
  }
  print(elapsed)
  invisible(value)
 }

 util$dotprogress <- function(callback, interval=10) {
  # Wrap `callback` (e.g. the function given to sapply()) so that a "." is printed
  # on every `interval`-th call. The wrapper passes all arguments on to `callback`
  # and returns its value.
  # ALTERNATIVE: plyr *ply(.progress='text')
  calls_seen <- 0
  function(...) {
    calls_seen <<- calls_seen+1
    if (calls_seen %% interval == 0) {
      cat(".")
    }
    callback(...)
  }
 }


 ########################################
 ##  RStudio recommended addins
 # Recommended RStudio addins, in the format install.deps() takes: entries of the
 # form "user/repo" are GitHub repositories, bare names are plain package names
 util$rstudio_addins <- c( "paleolimbot/rbbt", "milesmcbain/datapasta", "addinslist", "jennybc/jadd", 
                         "haozhu233/giphyr","gadenbuie/regexplain",  'hippie', "ShinyQuickStarter","annotater",
                         "ThinkR-open/remedy", "styler",  'konradzdeb/extraInserts', "gadenbuie/ermoji")
                         
               
 # Addins aimed at package development (same "user/repo" or package-name format)
 util$rstudio_dev_addins <-c("matt-dray/snorkel","crsh/citr", "testthis", "todor","rdoxygen","sinew","mdlincoln/docthis")



 ########################################
 ## Graphics output wrappers
 ## For easy one-liners, like:
 ## dopdf("tmp.pdf",width=5,height=5,cmd=plot(x,y))

 # Run plotting code with a pdf device as its target.
 #   filename : output file
 #   ...      : passed to pdf() (e.g. width, height)
 #   cmd      : plotting expression; it is evaluated lazily, after the device is open
 # When a variable OPEN exists and is TRUE, the file is then opened with the
 # system "open" command.
 util$dopdf <- function(filename,..., cmd) {
  pdf(filename, ...)
  # `finally` closes the device even when the plotting code fails, so an error
  # no longer leaves a stray open device behind
  tryCatch(eval(cmd), finally = dev.off())
  if (exists('OPEN') && OPEN)
    # shQuote() keeps file names containing spaces in one piece
    system(sprintf("open %s", shQuote(filename)))
 }

 # Run plotting code with a png device as its target.
 #   filename : output file
 #   ...      : passed to png() (e.g. width, height)
 #   cmd      : plotting expression; it is evaluated lazily, after the device is open
 # When a variable OPEN exists and is TRUE, the file is then opened with the
 # system "open" command.
 util$dopng <- function(filename,..., cmd) {
  png(filename, ...)
  # `finally` closes the device even when the plotting code fails
  tryCatch(eval(cmd), finally = dev.off())
  if ((exists('OPEN') && OPEN))
    # shQuote() keeps file names containing spaces in one piece
    system(sprintf("open %s", shQuote(filename)))
 }

 # Like capture.output(), but writes to a file: evaluates `cmd` while printed
 # output is diverted to `filename`.
 #   filename : file receiving the output
 #   cmd      : expression to evaluate (lazily, while the sink is active)
 #   open     : whether to open the file afterwards
 # The file is opened with the system "open" command when requested.
 util$dosink <- function(filename,cmd, open=NULL) {
  sink(filename)
  # `finally` restores normal output even when `cmd` fails
  tryCatch(eval(cmd), finally = sink(NULL))
  # The original delegated this decision to prio_check(), which is not defined in
  # this file. It is still used when available; otherwise an explicit `open`
  # takes priority over the global OPEN flag.
  # NOTE(review): fallback rule inferred from the arguments passed - confirm.
  fallback <- exists('OPEN') && OPEN
  should_open <- if (exists("prio_check", mode = "function")) {
    prio_check(open, fallback)
  } else if (!is.null(open)) {
    isTRUE(open)
  } else {
    fallback
  }
  if (should_open)
    # shQuote() keeps file names containing spaces in one piece
    system(sprintf("open %s", shQuote(filename)))
 }

 # Run plotting code with an SVG device (devSVG from package RSvgDevice) as its target.
 #   filename : output file
 #   ...      : passed to devSVG()
 #   cmd      : plotting expression; it is evaluated lazily, after the device is open
 #   open     : whether to open the file afterwards
 util$dosvg <- function(filename, ..., cmd, open=NULL) {
  # require() only returns FALSE when the package is missing; stop with a clear message
  if (!require("RSvgDevice")) stop("Package 'RSvgDevice' is required and not available, please install it and try again")
  devSVG(filename, ...)
  # `finally` closes the device even when the plotting code fails
  tryCatch(eval(cmd), finally = dev.off())
  # The original delegated this decision to prio_check(), which is not defined in
  # this file. It is still used when available; otherwise an explicit `open`
  # takes priority over the global OPEN flag.
  # NOTE(review): fallback rule inferred from the arguments passed - confirm.
  fallback <- exists('OPEN') && OPEN
  should_open <- if (exists("prio_check", mode = "function")) {
    prio_check(open, fallback)
  } else if (!is.null(open)) {
    isTRUE(open)
  } else {
    fallback
  }
  if (should_open)
    # shQuote() keeps file names containing spaces in one piece
    system(sprintf("open %s", shQuote(filename)))
 }

 # An R function to save pheatmap figure into pdf
 # This was copied from Stackflow: https://stackoverflow.com/questions/43051525/how-to-draw-pheatmap-plot-to-screen-and-also-save-to-file


 # Save a pheatmap result to a pdf file.
 #   x             : object with a `gtable` element (as returned by pheatmap())
 #   filename      : output pdf file
 #   width, height : page size passed to pdf()
 # Returns the value of dev.off().
 util$save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
    stopifnot(!missing(x))
    stopifnot(!missing(filename))
    pdf(filename, width=width, height=height)
    # Close the device if drawing fails, so an error does not leave it open
    device_open <- TRUE
    on.exit(if (device_open) dev.off(), add = TRUE)
    grid::grid.newpage()
    grid::grid.draw(x$gtable)
    device_open <- FALSE
    dev.off()
 }
       
 # A default plotting theme       
 # Default ggplot2 theme for plots.
 #   def_theme  : suffix of the base theme function, e.g. "grey" for theme_grey()
 #   baseSize   : base font size passed to the base theme
 #   strip_fill : fill colour of the facet strips
 # Returns a theme object that can be added to a plot.
 util$plot_theme <-  function(def_theme="grey", baseSize=22, strip_fill="lightskyblue"){
  # require() only returns FALSE when the package is missing; stop with a clear message
  if (!require("ggplot2")) stop("Package 'ggplot2' is required and not available, please install it and try again")
  # Look the theme function up by name rather than evaluating generated code;
  # this also accepts non-integer sizes, which the "%d" format rejected
  base_theme <- get(paste0("theme_", def_theme), mode = "function")
  my_theme <- base_theme(base_size=baseSize)  +
    theme(axis.title.y=element_text(face="bold", vjust = 1.5, size=rel(0.8)),
          axis.title.x=element_text(face="bold", vjust = 0.1, size=rel(0.8)),
          legend.title=element_text(size=rel(0.85), hjust=0.5),
          legend.text=element_text(size = rel(0.75),lineheight = 1.5),
          axis.text.y=element_text(size=rel(0.85)),
          axis.text.x=element_text(size=rel(0.85)),
          #panel.grid.minor=element_blank(),
          strip.text=element_text(size=rel(0.75)),
          strip.background=element_rect(fill = strip_fill))
  return(my_theme)
 }

 # plot correlation matrix (https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/)
 # do not edit
 # Plot a correlation matrix with corrplot(): coloured cells with the coefficient
 # printed in black, and cells whose p-value exceeds `sig.level` left blank.
 #   data      : columns to correlate; rows with missing values are dropped first
 #   method    : correlation method passed to cor() and cor.test()
 #   sig.level : significance threshold
 #   order, diag, type, tl.srt, number.font, number.cex, mar : passed to corrplot()
 util$corrplot2 <- function(data,
                      method = "pearson",
                      sig.level = 0.05,
                      order = "original",
                      diag = FALSE,
                      type = "upper",
                      tl.srt = 90,
                      number.font = 1,
                      number.cex = 1,
                      mar = c(0, 0, 0, 0)) {
  require("corrplot")
  # data_incomplete is not used further below
  data_incomplete <- data
  data <- data[complete.cases(data), ]
  mat <- cor(data, method = method)
  # Matrix of p-values: cor.test() for every pair of columns, 0 on the diagonal
  cor.mtest <- function(mat, method) {
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat <- matrix(NA, n, n)
    diag(p.mat) <- 0
    for (i in 1:(n - 1)) {
      for (j in (i + 1):n) {
        tmp <- cor.test(mat[, i], mat[, j], method = method)
        p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
      }
    }
    colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
    p.mat
  }
  p.mat <- cor.mtest(data, method = method)
  # Colour palette function; 200 colours are generated from it below
  col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
  corrplot(mat,
    method = "color", col = col(200), number.font = number.font,
    mar = mar, number.cex = number.cex,
    type = type, order = order,
    addCoef.col = "black", # add correlation coefficient
    tl.col = "black", tl.srt = tl.srt, # rotation of text labels
    # combine with significance level
    p.mat = p.mat, sig.level = sig.level, insig = "blank",
    # hide correlation coefficients on the diagonal
    diag = diag
  )
 }       
       
 # This function makes a pretty ggplot theme (https://ourcodingclub.github.io/tutorials/dataviz-beautification-synthesis/#distributions)
 # The only argument is `baseSize`, the base font size passed to theme_bw(),
 # so it can be called as theme_niwot() or, e.g., theme_niwot(baseSize = 12)

 # ggplot2 theme based on theme_bw(): no grid lines or panel border, black axis
 # lines, untitled legend placed inside the panel (bottom right).
 #   baseSize : base font size passed to theme_bw()
 util$theme_niwot <- function(baseSize=16){
  blank <- element_blank()
  axis_line <- element_line(color="black")
  theme_bw(base_size=baseSize) +
    # text and axes
    theme(text = element_text(family = "Helvetica Light"),
          axis.text = element_text(size = rel(0.85)),
          axis.line.x = axis_line,
          axis.line.y = axis_line,
          plot.title = element_text(vjust = 1, hjust = 0),
          plot.margin = unit(c(1, 1, 1, 1), units = "cm")) +
    # panel: no border and no grid lines
    theme(panel.border = blank,
          panel.grid.major.x = blank,
          panel.grid.minor.x = blank,
          panel.grid.minor.y = blank,
          panel.grid.major.y = blank) +
    # legend
    theme(legend.text = element_text(size = rel(0.85)),
          legend.title = blank,
          legend.position = c(0.95, 0.15),
          legend.key = blank,
          legend.background = element_rect(color = "black",
                                           fill = "transparent",
                                           size = 2, linetype = "blank"))
 }       
 # Create a filename with folder and current date/time before the extension
 # Build an output path of the form <outdir>/<filename>_<date>.<ext>.
 #   filename   : base name without extension
 #   ext        : extension, with or without leading dots
 #   outdir     : output folder; created (recursively) when it does not exist
 #   dateformat : format of the date stamp; NULL or FALSE leaves the stamp out
 util$filedate <- function(filename, ext="pdf", outdir=".", dateformat="%d_%m_%Y"){
  if (outdir!=".") {
    if (!file.exists(outdir)) dir.create(outdir, recursive = TRUE)
  }
  clean_ext <- sub("\\.*(.+)", "\\1", ext)
  undated <- is.null(dateformat) || dateformat==FALSE
  stamp <- if (undated) "" else paste0("_", format(Sys.time(), dateformat))
  file.path(outdir, sprintf("%s%s.%s", filename, stamp, clean_ext))
 }
   
 ########################################
 ## Plotting routines

 # Create nice log axis breaks (see package "scales")
 # Factory for log-axis break functions (see package "scales").
 #   n      : approximate number of ticks asked from axisTicks()
 #   minor  : when FALSE keep only the ticks whose printed form contains a "1"
 #   major  : when FALSE keep only the ticks whose printed form contains a "5"
 #   pretty : return the breaks formatted with prettyNum() instead of numbers
 # Returns a function that maps a vector of data values to its breaks.
 util$log_breaks <- function(n = 10, minor=TRUE, major=TRUE, pretty=FALSE){
  function(x) {
    log_range <- log10(range(x, na.rm = TRUE))
    ticks <- axisTicks(log_range, log = TRUE, n = n)
    if (!minor) {
      ticks <- ticks[grepl("1", prettyNum(ticks))]
    }
    if (!major) {
      ticks <- ticks[grepl("5", prettyNum(ticks))]
    }
    if (pretty) {
      return(prettyNum(ticks))
    }
    ticks
  }
 }
 # Create log axis labels
 # Factory for log-axis breaks: the returned function gives the axisTicks() of
 # the log10 range of its input (`n` = approximate number of ticks)
 util$base_breaks <- function(n = 10){
  function(x) {
    log_range <- log10(range(x, na.rm = TRUE))
    axisTicks(log_range, log = TRUE, n = n)
  }
 }
 # A function factory for getting integer y-axis values.
 # Function factory for integer axis breaks.
 #   n   : desired number of breaks, passed to pretty()
 #   ... : further arguments for pretty()
 # Returns a function that maps data values to the floor() of their pretty() breaks.
 util$integer_breaks <- function(n = 5, ...) {
  function(x) {
    whole_breaks <- floor(pretty(x, n, ...))
    names(whole_breaks) <- attr(whole_breaks, "labels")
    whole_breaks
  }
 }
 util$linelight <- function(x,y, lty='dashed', col='lightgray', ...) {
  # Highlight the point (x, y) on the current plot with two lines running from the
  # left and bottom edges of the plot region to the point.
  plot_region <- par('usr')
  left_edge <- plot_region[1]
  bottom_edge <- plot_region[3]
  segments(left_edge, y, x, y, lty=lty, col=col, ...)
  segments(x, bottom_edge, x, y, lty=lty, col=col, ...)
 }
      
 util$hintonplot <- function(mat, max_value=max(abs(mat)), mid_value=0, ...) {
  # Plot a matrix as coloured boxes ("Hinton plot"): the area of each box is
  # proportional to abs(value) / max_value; values above zero are drawn dark blue,
  # all others dark red. `...` is passed to title().
  #
  # Examples:
  #   hintonplot(matrix(rnorm(100),10))
  #   hintonplot(table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3)))   # counts

  plot.new()
  n_cols <- ncol(mat)
  n_rows <- nrow(mat)
  plot.window(xlim=c(0.5,n_cols+0.5), ylim=c(0.5,n_rows+0.5))

  # half of each box's side length (side = square root of the relative area)
  half_side <- sqrt(abs(mat) / max_value) / 2

  for (col_i in 1:n_cols) {
    for (row_i in n_rows:1) {
      h <- half_side[row_i,col_i]
      box_colour <- if (mat[row_i,col_i]>0) 'darkblue' else 'darkred'
      rect(col_i-h, row_i-h, col_i+h, row_i+h, col=box_colour)
    }
  }

  axis(1, 1:n_cols, labels=colnames(mat))
  axis(2, 1:n_rows, labels=row.names(mat))
  title(xlab=names(dimnames(mat))[2], ylab=names(dimnames(mat))[1], ...)
 }



 ########################################
 ## Rmarkdown functions
 # make captions for DT tables                 
 # Print an empty HTML table that only carries a "(#tab:<ref>)" caption, for
 # captioning DT tables in R Markdown.
 #   ref       : reference label; must not contain underscores or spaces
 #   caption   : caption text
 #   tab_width : value of the table's width attribute
 util$DT_caption <- function(ref, caption, tab_width="100%"){
  has_bad_chars <- grepl("[_ ]", ref)
  if (has_bad_chars) stop("Do not use underscores (_) or spaces ( ) in figure or table reference!")
  opening <- sprintf("<table width=%s>",tab_width)
  caption_tag <- sprintf("<caption>(#tab:%s)%s</caption>", ref, caption)
  cat(opening, caption_tag, "</table>", sep ="\n")
 }


 ########################################
 ## Has to be last in file

 # Remove every copy of "util" already on the search path (e.g. when this file is
 # sourced again), then attach the environment so its functions can be called
 # without the util$ prefix
 while("util" %in% search())
  detach("util")
 attach(util)
No results found