Skip to content

Instantly share code, notes, and snippets.

@IdoBar
Last active October 31, 2024 02:14
Show Gist options
  • Save IdoBar/7f63547158ecdbacf31b54a58af0d1cc to your computer and use it in GitHub Desktop.
Save IdoBar/7f63547158ecdbacf31b54a58af0d1cc to your computer and use it in GitHub Desktop.
A set of utilities to make easier R working environment
# util.R:
# Utilities to make R a happier place
# Originally created by Brendan O'Connor, brenocon.com/code
# Edited by Ido Bar.
# Source the file directly from this gist using `devtools::source_gist("7f63547158ecdbacf31b54a58af0d1cc", filename = "util.R")`
########################################
## Put everything into an environment, to not pollute global namespace
util = new.env()
########################################
## Better I/O routines
util$repath <- function(x, to=.Platform$OS.type) {
xa <- ifelse(to=="windows", gsub('/', '\\\\', x), gsub('\\\\', '/', x))
return(xa)
}
# search file recursively in a path
util$search.path <- function(fn,
paths = strsplit(chartr("\\", "/", Sys.getenv("RPATH")), split =
switch(.Platform$OS.type, windows = ";", ":"))[[1]]) {
for(d in paths)
if (file.exists(f <- file.path(d, fn)))
return(f)
return(NULL)
}
# Find recent file in a folder (based on pattern to search)
# Usage: recent_file("./data/intermediate_files", glue::glue(".*{stacks_name}.+\\.clean.vcf"))
util$recent_file <- function(filedir, pattern, sort_by="modified"){
files <- file.info(dir(filedir, pattern, full.names = TRUE))
files$filename <- rownames(files)
# files[[paste0(substr(sort_by,1,1), "time")]] <- as.POSIXct(files[[paste0(substr(sort_by,1,1), "time")]])
files[order(files[[paste0(substr(sort_by,1,1), "time")]] , decreasing = TRUE),"filename"][1]
}
# deprecated in favour of readr::read_tsv() function from tidyverse
# util$read.tsv <- function(..., header=F, sep='\t', quote='', comment='', na.strings='', stringsAsFactors=FALSE) {
# read.table() wrapper with default settings for no-nonsense, pure TSV
# Typical use case is output from another program.
# (R's defaults are more geared for human-readable datafiles, which is less
# feasible for large-scale data anyway.)
# These options are substantially faster than read.table() defaults.
# (see e.g. LINK)
# stringsAsFactors is the devil.
# args = list(...)
# args$header = headp
# if (!is.null(args$col.names)) {
# read.delim() is not smart about this. Yikes.
# args$header = FALSE
# }
# args$sep = sep
# args$quote = quote
# args$comment = comment
# args$stringsAsFactors = stringsAsFactors
# args$na.strings = na.strings
# do.call(read.delim, args)
# }
# deprecated in favour of readr::write_tsv() function from tidyverse
# util$write.tsv <- function(..., header=NA, col.names=F, row.names=F, sep='\t', na='', quote=F) {
# 'header' to 'col.names' naming consistency with read.table()
# if (is.finite(header)) col.names = header
# write.table(..., col.names=col.names, row.names=row.names, sep=sep, na=na, quote=quote)
# }
# write fasta files
util$write_fasta <- function(id_seq_table, line_width=60, filename){
file.create(filename)
apply(id_seq_table, 1, function(entry) cat(sprintf(">%s\n%s\n\n", entry[1],
gsub(sprintf("([GTAC]{%i})",line_width), "\\1\n", entry[2])),
file = filename, append = TRUE))
if (file.exists(filename)) return(TRUE)
else {
message(sprintf("Unable to create the file %s, please check that the path exists and is accessible", filename))
return(FALSE)
}
}
# improved implementation of xlsx::write.xlsx (allows overwrite of sheets and return the data as tibble, allowing it to be part of tidyverse pipes)
util$write_xlsx <- function(df, excel_file, sheet="Sheet1", overwritesheet=FALSE, updatefile = FALSE,
asTable=TRUE, return.tibble=TRUE,...){
if (!'openxlsx' %in% row.names(installed.packages())) stop("Package 'openxlsx' is required and not available, please install it and try again")
df <- as.data.frame(df)
# if (file.exists(excel_file) & isTRUE(overwritefile)) unlink(excel_file)
if (!file.exists(excel_file)) {
openxlsx::write.xlsx(df, excel_file, sheetName = sheet, rowNames=FALSE, asTable=asTable, ...)
if (isTRUE(return.tibble) & ('tibble' %in% row.names(installed.packages()))) df <- tibble::as_tibble(df)
return(invisible(df))
}
if (file.exists(excel_file) & !isTRUE(updatefile)) stop(sprintf("File %s exists and 'updatefile' set to FALSE.", excel_file))
wb <- openxlsx::loadWorkbook(excel_file)
sheets <- openxlsx::getSheetNames(excel_file)
if (sheet %in% sheets){
if (overwritesheet==FALSE) stop(sprintf("Sheet %s exists in file and 'overwritesheet' set to FALSE, please try again with a new sheet name or with 'overwritesheet=TRUE' option", sheet))
openxlsx::removeWorksheet(wb, sheet)
}
openxlsx::addWorksheet(wb, sheet)
if (asTable==TRUE){
openxlsx::writeDataTable(wb, sheet, df)
} else {
openxlsx::writeData(wb, sheet, df)
}
openxlsx::saveWorkbook(wb, excel_file, overwrite = updatefile)
# openxlsx::write.xlsx(df, excel_file, sheetName = sheet, row.names=FALSE, ...)
if (isTRUE(return.tibble) & ('tibble' %in% row.names(installed.packages()))) df <- tibble::as_tibble(df)
return(invisible(df))
}
# extract hyperlinks from excel sheets (see https://stackoverflow.com/a/70676505/5346827)
util$hyperlinks_from_excel <- function(aExcelFile, ExcelSheet="all",
aRefOutputFile = NULL){
req_pkgs <- c("XML", "purrr", "dplyr", "readxl")
if (!'rlang' %in% row.names(installed.packages())) stop("'rlang' package is not installed, please install it and try again")
# if (!"rlang" %in% installed.packages() require(XML)) stop("XML package is not installed")
rlang::check_installed(req_pkgs,
reason = "Package XML is required to extract the urls from the sheets, while purrr and dplyr are needed to process the results")
# ExcelSheet="submission_info"
# aExcelFile="sample_info/AGRF_NGS_yield.xlsx"
sheets <- readxl::excel_sheets(aExcelFile)
read_relationships <- function(aSheetIndex, tmp_dir){
# tmp_dir= tmp_base_dir
# aSheetIndex = 2
filename <- file.path(tmp_dir, 'xl', 'worksheets', '_rels', paste0('sheet', aSheetIndex, '.xml.rels'))
rel <- XML::xmlParse(filename) %>% XML::xmlToList() %>% purrr::map_dfr(as.list) %>%
dplyr::filter(grepl("hyperlink$", Type)) %>%
dplyr::select(id=Id, target=Target) #%>% setNames()
if(nrow(rel) == 0){
return(NULL)
}
filename <- file.path(tmp_dir, 'xl', 'worksheets', paste0('sheet', aSheetIndex, '.xml'))
pos <- XML::xmlParse(filename) %>% XML::xmlToList()
if(is.null(pos$hyperlinks)){
return(NULL)
}
ret <- purrr::map_dfr(pos$hyperlinks, as.list) %>%
# dplyr::select(-uid) %>%
dplyr::inner_join(rel, by = 'id') %>%
dplyr::mutate(tab_idx=aSheetIndex)
link_text <- ret %>% dplyr::select(text=display, target) %>%
dplyr::filter(!is.na(text)) %>%
dplyr::distinct()
return(ret %>% left_join(link_text))
}
EXCEL_TEMP_NAME <- 'unzipped_excel'
tmp_base_dir <- file.path(tempdir(),
paste0('tmpexcl',
as.character(round(runif(1, 1000000000000, 9999999999999)))))
dir.create(tmp_base_dir)
on.exit(unlink(tmp_base_dir))
zipfile <- file.path(tmp_base_dir, paste0(EXCEL_TEMP_NAME, '.zip'))
file.copy(from = aExcelFile, to = zipfile)
unzip(zipfile, exdir = tmp_base_dir)
sheet_idx <- NULL
# ExcelSheet=sheets[1]
if (is.numeric(ExcelSheet) & all(ExcelSheet %in% seq_along(sheets))) {
sheet_idx <- ExcelSheet
} else if (is.character(ExcelSheet) & all(ExcelSheet %in% sheets)) {
sheet_idx <- which(sheets %in% ExcelSheet)
} else if (tolower(ExcelSheet)=="all") {
sheet_idx <- seq_along(sheets)
}
if (length(sheet_idx)==0) stop("Sheets provided to 'ExcelSheet' argument not present in the Excel file")
res <- purrr::map_dfr(sheet_idx, ~read_relationships(.x, tmp_base_dir)) %>%
dplyr::mutate(sheet = sheets[tab_idx]) %>%
dplyr::mutate(ref = paste0("'", sheet, "'!", ref)) %>%
dplyr::select(id,tab_idx, sheet, ref, target, text)
if(!is.null(aRefOutputFile)){
write_csv(res, aRefOutputFile)
}
return(res)
}
########################################
## Misc small routines
# calculates standard error
util$se <- function(x, na.rm=TRUE) {
if(na.rm) return(sqrt(var(x, na.rm=TRUE)/length(x[!is.na(x)]))) else return(sqrt(var(x)/length(x)))
}
# calculate geometric mean (see https://stackoverflow.com/a/25555105/5346827)
util$gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
util$as.c <- as.character
# split vector into chunks (see https://stackoverflow.com/questions/3318333/split-a-vector-into-chunks)
util$chunk <- function(x, n) (mapply(function(a, b) (x[a:b]), seq.int(from=1, to=length(x), by=n), pmin(seq.int(from=1, to=length(x), by=n)+(n-1), length(x)), SIMPLIFY=FALSE))
util$chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE))
util$unwhich <- function(indices, len=length(indices)) {
# reverse of which(): from indices to boolean mask.
ret = rep(F,len)
ret[indices] = T
ret
}
util$nna <- function(...) !is.na(...) # i type this a lot, i think its worth 3 characters + shift key
util$shuffle <- function(...) UseMethod("shuffle")
util$shuffle.default <- function(x) x[order(runif(length(x)))]
util$shuffle.data.frame <- function(x) x[order(runif(nrow(x))),]
util$sample_df <- function(d, size=10, ...) {
samp = sample(1:nrow(d), size=size, ...)
d[samp,]
}
# split a dataframe into subsets by size
util$split_df <- function(df, group_size){
split(df, gl(ceiling(nrow(df)/group_size), group_size, nrow(df)))
}
util$present_levels <- function(x) intersect(levels(x), x)
util$trim_levels <- function(...) UseMethod("trim_levels")
util$trim_levels.factor <- function(x) factor(x, levels=present_levels(x))
util$trim_levels.data.frame <- function(x) {
for (n in names(x))
if (is.factor(x[,n]))
x[,n] = trim_levels(x[,n])
x
}
util$unfactor <- function(df){
id <- sapply(df, is.factor)
df[id] <- lapply(df[id], as.character)
df
}
# grep() returns indices of matches. Variants:
util$bgrep <- function(pat,x, ...) {
# "boolean" grep: return a logical vector ready for vector ops
# like & | and others
unwhich(grep(pat,x,...), length(x))
}
util$ngrep <- function(pat,x, ...)
# "normal" grep: return values, not indices
x[grep(pat,x,...)]
########################################
## Other data manipulation routines
util$tidy_transpose <- function(df) {
df %>% t() %>% # Tranpose, but function is for matrices. Return Matrix
as.data.frame() %>% #Force to be dataframe
tibble::rownames_to_column(var = "rowname") %>% #Resave first column from rownames
janitor::row_to_names(row_number = 1) #Resave column headers from first row.
}
util$tapply2 <- function(x, ...) {
# like tapply but preserves factors
if (is.factor(x)) {
r = factor(tapply(as.character(x), ...), levels=levels(x))
} else {
r = tapply(x, ...)
}
r
}
util$inject <- function(collection, start, fn) {
# like lisp reduce. (named after ruby)
acc = start
for (x in collection)
acc = fn(acc, x)
acc
}
##### Bioinformatics Annotation #######
# Shortens and "prettify" long BLAST descriptions
util$pretty_term <- function(term, prefix="Predicted:", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, firstCapOnly=FALSE,
spDot=TRUE, charNum=0, showTrimmed=NULL) {
term <- sapply(term, function (s) {
s_ <- s
if (is.character(prefix) && length(prefix)>0) s_ <- sub(sprintf("^%s\\s*",prefix) ,"", s_, ignore.case = TRUE, perl=TRUE)
if (cap) s_ <- sub("(^.)", "\\U\\1", s_, perl=TRUE)
if (firstCapOnly) s_ <- paste(c(unlist(strsplit(s_, " ", fixed = TRUE))[1], sapply(unlist(strsplit(s_, " ", fixed = TRUE))[-1], function(n) sub("(^.)", "\\L\\1", n, perl=TRUE), USE.NAMES = FALSE)), collapse = " ")
if (spDot) s_ <- sub("sp\\. (.+?)\\.(.+)", "sp. \\1_\\2", s_, perl=TRUE)
if (paren) s_ <- sub(" \\(.+\\)", "", s_)
if (dot) s_ <- unlist(strsplit(s_, ".", fixed = TRUE))[1]
if (comma) s_ <- unlist(strsplit(s_, ",", fixed = TRUE))[1]
# Trim to the end of the word if character number limit is reached and remove any trailing commas or dots
if (charNum>0) s_ <- sub("[,|\\.]$", "", sub(sprintf("(^.{%s}[\\S]*)[ ]*.*", charNum), "\\1", s_, perl=TRUE), perl = TRUE)
# Add a ^ at the end if the term was trimmed anyhow
if (nchar(s_)<nchar(s) && !grepl("\\*$", s_) && !is.null(showTrimmed)) return(paste0(s_, as.character(showTrimmed)))
else return(s_)
}, USE.NAMES = FALSE)
return(term)
}
# Find unique Tax_name
util$find_uniq_tax <- function(desc, taxName, prettify=FALSE) {
if (is.na(taxName) || taxName=="N/A") uniqTax <- ifelse((is.na(desc) || !grepl(".+\\[(.*)\\]", desc)), NA, sub(".+\\[(.*)\\]","\\1", desc))
else {
if (grepl(";", taxName)) {
taxList <- unlist(strsplit(taxName, ";"))
taxDesc <- sub(".+\\[(.*)\\]","\\1", desc)
uniqTax <- ifelse(taxDesc %in% taxList, taxDesc, taxList[length(taxList)])
} else uniqTax <- taxName
}
if (prettify) uniqTax <- pretty_term(uniqTax, prefix="Undetermined", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, spDot=TRUE, charNum=0, showTrimmed=FALSE)
return(uniqTax)
}
# Resolve scientific name (with taxize package)
util$resolve_scientific <- function(tax_names) {
tax_count=0
resolved <- lapply(tax_names, function(x) {
tax_count <<- tax_count + 1
perc <- tax_count/length(tax_names)*100
message(sprintf("Processing %s (%.1f%% completed)", x, perc))
res_name <- try(taxize::gnr_resolve(names = x, data_source_ids=4, canonical = TRUE,best_match_only=FALSE ), TRUE)
if (!inherits(res_name, "try-error")) return(res_name$results[which.max(res_name$results$score),])
else {
message(sprintf("Warning: Could not process %s correctly, returning NAs", x))
return(data.frame(submitted_name=x,matched_name=NA,data_source_title=NA,score=NA))
}
} )
return(resolved)
}
# Define a function that accepts a matrix of DE contrasts and DESeq data to analyse each contrast
util$get_deseq2_res <- function(dds_deseq2, mat_col, group_col = "sample_group") {
contrast <- c(group_col,rev(mat_col))
comparison <- paste0(rev(mat_col), collapse="_vs_")
res <- DESeq2::results(dds_deseq2, contrast=contrast, tidy = TRUE) %>% as_tibble() %>%
dplyr::rename(gene_id=row) %>%
dplyr::mutate(contrast=comparison, analysis_date=analysis_date) %>%
dplyr::filter(!is.na(padj)) %>% dplyr::arrange(padj, desc(abs(log2FoldChange)))
return(res)
}
# Modify plotPCA to return all 4 PCs
util$plotPCA4 <- function (dds_deseq2, intgroup = "condition", ntop = 500, PCs=3:4, returnData = FALSE){
rv <- genefilter::rowVars(SummarizedExperiment::assay(dds_deseq2))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(SummarizedExperiment::assay(dds_deseq2)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
if (!all(intgroup %in% names(SummarizedExperiment::colData(dds_deseq2)))) {
stop("the argument 'intgroup' should specify columns of colData(dds)")
}
intgroup.df <- as.data.frame(SummarizedExperiment::colData(dds_deseq2)[, intgroup,
drop = FALSE])
group <- if (length(intgroup) > 1) {
factor(apply(intgroup.df, 1, paste, collapse = " : "))
}
else {
SummarizedExperiment::colData(dds_deseq2)[[intgroup]]
}
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group,
intgroup.df, name = colnames(dds_deseq2))
if (returnData) {
attr(d, "percentVar") <- percentVar[1:4]
return(d)
}
ggplot2::ggplot(data = d, aes_string(x = paste0("PC", PCs[1]), y = paste0("PC", PCs[2]), color = "group")) +
ggplot2::geom_point(size = 3) +
ggplot2::labs(x = sprintf("PC%d: %.2f%% variance", PCs[1], percentVar[PCs[1]]*100),
y = sprintf("PC%d: %.2f%% variance", PCs[2], percentVar[PCs[2]]*100)) + ggplot2::coord_fixed()
# + ylab(paste0("PC4: ", round(percentVar[2] * 100), "% variance"))
}
# blast headers (see https://www.metagenomics.wiki/tools/blast/blastn-output-format-6)
util$blast_std_headers <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
"qstart", "qend", "sstart", "send", "evalue", "bitscore")
# blast extended headers (with titles and taxonomy)
util$blast_tax_headers <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
"qstart", "qend", "sstart", "send", "evalue", "bitscore",
'stitle', 'staxids', 'ssciname', 'scomname')
util$interpro_headers <- c("protein","md5","length","analysis","accession","description","start","stop","score","status",
"date","interpro_accession","interpro_description","go_annotation","pathway_annotation")
########################################
## Printing, viewing
## see also: str()
## Transparent colors
## Mark Gardener 2015
## www.dataanalytics.org.uk
util$t_col <- function(color, percent = 50, name = NULL) {
# color = color name, percent = % transparency, name = an optional name for the color
## Get RGB values for named color
rgb.val <- col2rgb(color)
## Make new color using input color as base and alpha set by transparency
t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3],
max = 255,
alpha = (100-percent)*255/100,
names = name)
## Save the color
invisible(t.col)
}
util$printf <- function(...) cat(sprintf(...))
util$listprint <- function(x) {
s = paste(sapply(names(x), function(n) sprintf("%s=%s", n,x[[n]])), collapse=' ')
printf("%s\n", s)
}
util$msg <- function(...) cat(..., "\n", file=stderr())
util$LogMsg <- function(text, dateformat="%d-%m-%Y %I:%M:%S", ...) {
if (is.null(dateformat) | (dateformat==FALSE)) return(cat(paste0(text, "\n")))
msg <- sprintf(paste0(format(Sys.time(), dateformat), ": ", text, "\n"), ...)
cat(msg)
}
util$h = utils::head
util$ppy <- function(x, column.major=FALSE, ...) {
# pretty-print as yaml. intended for rows with big textual cells.
# a la mysql's \G operator
library(yaml)
cat(as.yaml(x, column.major=column.major), ...)
cat("\n", ...)
}
util$table_html = function(...) {
# Intended for inside dosink()
columns = list(...)
ncol = length(columns)
nrow = length(columns[[1]])
# assume columns are in parallel
printf("\n<table cellpadding=3 border=1 cellspacing=0 bordercolor=gray>")
for (i in 1:nrow) {
printf("\n<tr>")
for (j in 1:ncol)
printf("\n <td>%s", columns[[j]][i])
}
printf("\n</table>\n")
}
########################################
## Workspace management
# Fix devtools
util$.fixdevtools <- function() {
dev_reqs <- c("httr", "devtools")
install.deps(dev_reqs)
httr::set_config( httr::config( ssl_verifypeer = 0L ) )
options(buildtools.check = function(action) TRUE )
}
# Silently load a package, can change mode to "require"
util$silent.load <- function(a.package, mode="library"){
eval(parse(text=sprintf("suppressWarnings(suppressPackageStartupMessages(
%s(a.package, character.only=TRUE)))", mode)))
}
# install and load packages from CRAN, Bioconductor or git, see usage below
util$install.deps <- function(p, repo="cran", reposite=getOption("repos"), silent=FALSE, force=FALSE){
# repos=getOption("repos")
if (is.null(reposite)) reposite <- "http://cloud.r-project.org/"
call_install <- switch(repo,
cran=sprintf("install.packages(package, repos=\"%s\"", reposite),
bioc="biocLite(package, suppressUpdates=TRUE, ask=FALSE",
git="install_github(package")
if (repo=="git") .fixdevtools()
if (repo=="bioc" & !all(p %in% utils::installed.packages())) {
#source("http://bioconductor.org/biocLite.R")
if (!"RCurl" %in% utils::installed.packages()) {
install.packages("RCurl", repos=reposite)
}
eval(parse(text = RCurl::getURL("http://bioconductor.org/biocLite.R",
ssl.verifypeer=FALSE)))
biocLite(suppressUpdates=TRUE, ask=FALSE)
}
for (package in p) {
gitNames <- unlist(strsplit(package, "/"))
pName <- gitNames[length(gitNames)]
if (!pName %in% utils::installed.packages() | isTRUE(force)) {
cat(sprintf("Please wait, installing and loading missing package %s and its dependencies from %s.\n",
package, repo), file=stderr())
suppressWarnings(eval(parse(text = sprintf("%s, quiet = TRUE, lib=.libPaths()[1])",call_install))))
if (!pName %in% utils::installed.packages()) {
ifelse(!interactive(),q("no", 2, TRUE),
stop(sprintf("Unable to install missing package %s from %s.\n",
package, repo), call. = FALSE))
}
}
if (isTRUE(silent)) silent.load(pName, mode="require")
else require(pName, character.only=T, quietly=TRUE, warn.conflicts=TRUE)
}
}
# CRAN_packages <- c("RCurl", "dplyr", "tidyr", "ggplot2", "RColorBrewer", "RSQLite")
# install.deps(CRAN_packages)
# improved list of objects
# http://stackoverflow.com/questions/1358003/tricks-to-manage-the-available-memory-in-an-r-session
util$list_objects = function (pos = 1, pattern) {
napply = function(names, fn) sapply(names, function(x)
fn(get(x, pos = pos)))
names = ls(pos = pos, pattern = pattern)
N = length(names)
obj_class = napply(names, function(x) as.character(class(x))[1])
obj_mode = napply(names, mode)
obj_type = ifelse(is.na(obj_class), obj_mode, obj_class)
obj_prettysize = napply(names, function(x) {
capture.output(print(object.size(x), units = "auto")) })
obj_size = napply(names, object.size)
obj_prettysize[obj_size < 1e6] = ""
obj_length = napply(names, function(x) length(x))
obj_dim = t(napply(names, function(x)
as.numeric(dim(x))[1:2]))
is_flat = is.na(obj_dim)[, 1]
is_vector = napply(names, function(x) is.vector(x) & class(x) != 'list')
info_width = max(20, options('width')$width - 60)
small_str = function(x) {
out = capture.output(
str(x, max.level=0, give.attr=F, give.head=F, width=info_width, strict.width='cut')
)
out = str_c(out,collapse=' ')
out = cutoff(str_replace(out,"\n"," "))
if (str_detect(out, "^List of"))
out = str_c("[Names] $ ", str_c(names(x),collapse=' '))
cutoff(out)
}
cutoff = function(s) {
if (str_length(s) >= info_width) {
str_c(str_sub(s,1,info_width-2),'..')
} else {
s
}
}
pad = function(s) sprintf(" %s", s)
out <- data.frame(
Type = obj_type,
Size = obj_prettysize,
Dim = ifelse(is_vector | is_flat, obj_length,
sprintf("(%s, %s)", obj_dim[,1], obj_dim[,2])),
Value = napply(names, function(x)
if (class(x) %in% c('data.frame','list') && !is.null(names(x)))
cutoff(str_c("[Names] $ ",str_c(names(x), collapse=' ')))
else small_str(x)
),
stringsAsFactors=F)
row.names(out) = names
out$Dim = sprintf(" %s", out$Dim)
out$Value = sprintf(str_c(" %-", info_width, "s"), out$Value)
out = rbind(subset(out, Type!='function'), subset(out, Type=='function'))
out
}
util$lsos = function() {
d = list_objects()
d$name = row.names(d)
d = subset(d, name != 'util')
row.names(d)=d$name
d$name=NULL
d
}
########################################
## For performance optimization and long-running jobs
util$timeit <- function(expr, name=NULL) {
# print how long the expression takes, and return its value too.
# So you can interpose timeit({ blabla }) around any chunk of code "blabla".
start = Sys.time()
ret = eval(expr)
finish = Sys.time()
if (!is.null(name)) cat(name,": ")
print(finish-start)
invisible(ret)
}
util$dotprogress <- function(callback, interval=10) {
# intended to wrap the anonymous callback for sapply() or somesuch.
# ALTERNATIVE: plyr *ply(.progress='text')
count = 0
return(function(...) {
if ((count <<- count+1) %% interval == 0)
cat(".")
callback(...)
})
}
########################################
## External programs for interactivity
########################################
## Graphics output wrappers
## For easy one-liners, like:
## dopdf("tmp.pdf",width=5,height=5,cmd=plot(x,y))
util$dopdf <- function(filename,..., cmd) {
pdf(filename, ...)
eval(cmd)
dev.off()
if (exists('OPEN') && OPEN)
system(sprintf("open %s", filename))
}
util$dopng <- function(filename,..., cmd) {
png(filename, ...)
eval(cmd)
dev.off()
if ((exists('OPEN') && OPEN))
system(sprintf("open %s", filename))
}
util$dosink <- function(filename,cmd, open=NULL) {
# like capture.output() but follows open/OPEN conventions here
sink(filename)
eval(cmd)
sink(NULL)
if (prio_check(open, exists('OPEN') && OPEN))
system(sprintf("open %s", filename))
}
util$dosvg <- function(filename, ..., cmd, open=NULL) {
require("RSvgDevice")
devSVG(filename, ...)
eval(cmd)
dev.off()
if (prio_check(open, exists('OPEN') && OPEN))
system(sprintf("open %s", filename))
}
# An R function to save pheatmap figure into pdf
# This was copied from Stackflow: https://stackoverflow.com/questions/43051525/how-to-draw-pheatmap-plot-to-screen-and-also-save-to-file
util$save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
stopifnot(!missing(x))
stopifnot(!missing(filename))
pdf(filename, width=width, height=height)
grid::grid.newpage()
grid::grid.draw(x$gtable)
dev.off()
}
# A default plotting theme
util$plot_theme <- function(def_theme="grey", baseSize=22, strip_fill="lightskyblue"){
require("ggplot2")
my_theme <- eval(parse(text = sprintf("theme_%s(base_size=%d)", def_theme,baseSize))) +
theme(axis.title.y=element_text(face="bold", vjust = 1.5, size=rel(0.8)),
axis.title.x=element_text(face="bold", vjust = 0.1, size=rel(0.8)),
legend.title=element_text(size=rel(0.85), hjust=0.5),
legend.text=element_text(size = rel(0.75),lineheight = 1.5),
axis.text.y=element_text(size=rel(0.85)),
axis.text.x=element_text(size=rel(0.85)),
#panel.grid.minor=element_blank(),
strip.text=element_text(size=rel(0.75)),
strip.background=element_rect(fill = strip_fill))
return(my_theme)
}
# plot correlation matrix (https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/)
# do not edit
util$corrplot2 <- function(data,
method = "pearson",
sig.level = 0.05,
order = "original",
diag = FALSE,
type = "upper",
tl.srt = 90,
number.font = 1,
number.cex = 1,
mar = c(0, 0, 0, 0)) {
require("corrplot")
data_incomplete <- data
data <- data[complete.cases(data), ]
mat <- cor(data, method = method)
cor.mtest <- function(mat, method) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat <- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], method = method)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
p.mat <- cor.mtest(data, method = method)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(mat,
method = "color", col = col(200), number.font = number.font,
mar = mar, number.cex = number.cex,
type = type, order = order,
addCoef.col = "black", # add correlation coefficient
tl.col = "black", tl.srt = tl.srt, # rotation of text labels
# combine with significance level
p.mat = p.mat, sig.level = sig.level, insig = "blank",
# hide correlation coefficients on the diagonal
diag = diag
)
}
# This function makes a pretty ggplot theme (https://ourcodingclub.github.io/tutorials/dataviz-beautification-synthesis/#distributions)
# This function takes no arguments
# meaning that you always have just niwot_theme() and not niwot_theme(something else here)
util$theme_niwot <- function(baseSize=16){
theme_bw(base_size=baseSize) +
theme(text = element_text(family = "Helvetica Light"),
axis.text = element_text(size = rel(0.85)),
# axis.title = element_text(size = 18),
axis.line.x = element_line(color="black"),
axis.line.y = element_line(color="black"),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
plot.margin = unit(c(1, 1, 1, 1), units = , "cm"),
plot.title = element_text(vjust = 1, hjust = 0),
legend.text = element_text(size = rel(0.85)),
legend.title = element_blank(),
legend.position = c(0.95, 0.15),
legend.key = element_blank(),
legend.background = element_rect(color = "black",
fill = "transparent",
size = 2, linetype = "blank"))
}
# Create a filename with folder and current date/time before the extension
util$filedate <- function(filename, ext="pdf", outdir=".", dateformat="%d_%m_%Y"){
if (outdir!="." & !file.exists(outdir)) dir.create(outdir, recursive = TRUE)
ext <- sub("\\.*(.+)", "\\1", ext)
if (is.null(dateformat)) return(file.path(outdir, sprintf("%s.%s", filename, ext)))
if (dateformat==FALSE) return(file.path(outdir, sprintf("%s.%s", filename, ext)))
return(file.path(outdir, sprintf("%s_%s.%s", filename, format(Sys.time(), dateformat), ext)))
}
########################################
## Plotting routines
# Create nice log axis breaks (see package "scales")
util$log_breaks <- function(n = 10, minor=TRUE, major=TRUE, pretty=FALSE){
function(x) {
b <- axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n)
if (!minor) b <- b[grepl("1", prettyNum(b))]
if (!major) b <- b[grepl("5", prettyNum(b))]
if (pretty) b <- prettyNum(b)
return(b)
}
}
# Create log axis labels
util$base_breaks <- function(n = 10){
function(x) {
axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n)
}
}
# A function factory for getting integer y-axis values.
util$integer_breaks <- function(n = 5, ...) {
fxn <- function(x) {
breaks <- floor(pretty(x, n, ...))
names(breaks) <- attr(breaks, "labels")
breaks
}
return(fxn)
}
util$linelight <- function(x,y, lty='dashed', col='lightgray', ...) {
# highlight a point with lines running to the axes.
left = par('usr')[1]
bot = par('usr')[3]
segments(left,y, x,y, lty=lty, col=col, ...)
segments(x,bot, x,y, lty=lty, col=col, ...)
}
util$hintonplot <- function(mat, max_value=max(abs(mat)), mid_value=0, ...) {
# Plots a matrix as colored, size-varying boxes
# I dunno who started calling this a "Hinton plot", but anyways
# Example:
# hintonplot(matrix(rnorm(100),10))
# Example, for counts:
# table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3))
# mpg
# cyl (10.4,18.2] (18.2,26.1] (26.1,33.9]
# 4 0 6 5
# 6 2 5 0
# 8 12 2 0
# hintonplot(table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3)))
plot.new()
plot.window(xlim=c(0.5,ncol(mat)+0.5), ylim=c(0.5,nrow(mat)+0.5))
x_mid = 1:ncol(mat)
y_mid = 1:nrow(mat)
area = abs(mat) / max_value
side = sqrt(area)
for (x in 1:ncol(mat)) {
for (y in nrow(mat):1) {
# ym = (nrow(mat):1)[y]
ym = y
d = side[ym,x] / 2
rect(x-d, y-d, x+d, y+d, col=if (mat[ym,x]>0) 'darkblue' else 'darkred')
}
}
axis(1, 1:ncol(mat), labels=colnames(mat))
# axis(2, nrow(mat):1, labels=row.names(mat))
axis(2, 1:nrow(mat), labels=row.names(mat))
title(xlab=names(dimnames(mat))[2], ylab=names(dimnames(mat))[1], ...)
}
########################################
## Rmarkdown functions
# make captions for DT tables
util$DT_caption <- function(ref, caption, tab_width="100%"){
# Do not use underscores or spaces in references!
if (grepl("[_ ]", ref)) stop("Do not use underscores (_) or spaces ( ) in figure or table reference!")
cat(sprintf("<table width=%s>",tab_width) ,
sprintf("<caption>(#tab:%s)%s</caption>", ref, caption),"</table>",
sep ="\n")
}
########################################
## Has to be last in file
while("util" %in% search())
detach("util")
attach(util)
@IdoBar
Copy link
Author

IdoBar commented Jun 27, 2018

Fixed filedate() to recursively create folder names

@IdoBar
Copy link
Author

IdoBar commented Jun 4, 2019

Remove dependency on xlsx package (requires Java) in favor of openxlsx in write_xslx()

@IdoBar
Copy link
Author

IdoBar commented Nov 23, 2019

Improved functionality of write_xlsx() (check dependencies, optional conversion to tibble)

@IdoBar
Copy link
Author

IdoBar commented May 29, 2022

Added Correlation plot function corrplot2() (see source)

@IdoBar
Copy link
Author

IdoBar commented Apr 27, 2023

Added split_df() to split a dataframe into equal-sized subsets (returns a list of dataframes).
See source on SO

@IdoBar
Copy link
Author

IdoBar commented Sep 1, 2023

Added chunk() to split a vector to chunks (very useful for working with purrr).
See source on SO

@IdoBar
Copy link
Author

IdoBar commented Aug 29, 2024

Added NCBI blast headers (see this link)
Available as vectors: blast_std_headers and blast_tax_headers

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment