Last active
October 31, 2024 02:14
-
-
Save IdoBar/7f63547158ecdbacf31b54a58af0d1cc to your computer and use it in GitHub Desktop.
A set of utilities to make easier R working environment
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# util.R: | |
# Utilities to make R a happier place | |
# Originally created by Brendan O'Connor, brenocon.com/code | |
# Edited by Ido Bar. | |
# Source the file directly from this gist using `devtools::source_gist("7f63547158ecdbacf31b54a58af0d1cc", filename = "util.R")` | |
######################################## | |
## Put everything into an environment, to not pollute global namespace | |
util = new.env() | |
######################################## | |
## Better I/O routines | |
util$repath <- function(x, to=.Platform$OS.type) { | |
xa <- ifelse(to=="windows", gsub('/', '\\\\', x), gsub('\\\\', '/', x)) | |
return(xa) | |
} | |
# search file recursively in a path | |
util$search.path <- function(fn, | |
paths = strsplit(chartr("\\", "/", Sys.getenv("RPATH")), split = | |
switch(.Platform$OS.type, windows = ";", ":"))[[1]]) { | |
for(d in paths) | |
if (file.exists(f <- file.path(d, fn))) | |
return(f) | |
return(NULL) | |
} | |
# Find recent file in a folder (based on pattern to search) | |
# Usage: recent_file("./data/intermediate_files", glue::glue(".*{stacks_name}.+\\.clean.vcf")) | |
util$recent_file <- function(filedir, pattern, sort_by="modified"){ | |
files <- file.info(dir(filedir, pattern, full.names = TRUE)) | |
files$filename <- rownames(files) | |
# files[[paste0(substr(sort_by,1,1), "time")]] <- as.POSIXct(files[[paste0(substr(sort_by,1,1), "time")]]) | |
files[order(files[[paste0(substr(sort_by,1,1), "time")]] , decreasing = TRUE),"filename"][1] | |
} | |
# deprecated in favour of readr::read_tsv() function from tidyverse | |
# util$read.tsv <- function(..., header=F, sep='\t', quote='', comment='', na.strings='', stringsAsFactors=FALSE) { | |
# read.table() wrapper with default settings for no-nonsense, pure TSV | |
# Typical use case is output from another program. | |
# (R's defaults are more geared for human-readable datafiles, which is less | |
# feasible for large-scale data anyway.) | |
# These options are substantially faster than read.table() defaults. | |
# (see e.g. LINK) | |
# stringsAsFactors is the devil. | |
# args = list(...) | |
# args$header = headp | |
# if (!is.null(args$col.names)) { | |
# read.delim() is not smart about this. Yikes. | |
# args$header = FALSE | |
# } | |
# args$sep = sep | |
# args$quote = quote | |
# args$comment = comment | |
# args$stringsAsFactors = stringsAsFactors | |
# args$na.strings = na.strings | |
# do.call(read.delim, args) | |
# } | |
# deprecated in favour of readr::write_tsv() function from tidyverse | |
# util$write.tsv <- function(..., header=NA, col.names=F, row.names=F, sep='\t', na='', quote=F) { | |
# 'header' to 'col.names' naming consistency with read.table() | |
# if (is.finite(header)) col.names = header | |
# write.table(..., col.names=col.names, row.names=row.names, sep=sep, na=na, quote=quote) | |
# } | |
# write fasta files | |
util$write_fasta <- function(id_seq_table, line_width=60, filename){ | |
file.create(filename) | |
apply(id_seq_table, 1, function(entry) cat(sprintf(">%s\n%s\n\n", entry[1], | |
gsub(sprintf("([GTAC]{%i})",line_width), "\\1\n", entry[2])), | |
file = filename, append = TRUE)) | |
if (file.exists(filename)) return(TRUE) | |
else { | |
message(sprintf("Unable to create the file %s, please check that the path exists and is accessible", filename)) | |
return(FALSE) | |
} | |
} | |
# improved implementation of xlsx::write.xlsx (allows overwrite of sheets and return the data as tibble, allowing it to be part of tidyverse pipes) | |
util$write_xlsx <- function(df, excel_file, sheet="Sheet1", overwritesheet=FALSE, updatefile = FALSE, | |
asTable=TRUE, return.tibble=TRUE,...){ | |
if (!'openxlsx' %in% row.names(installed.packages())) stop("Package 'openxlsx' is required and not available, please install it and try again") | |
df <- as.data.frame(df) | |
# if (file.exists(excel_file) & isTRUE(overwritefile)) unlink(excel_file) | |
if (!file.exists(excel_file)) { | |
openxlsx::write.xlsx(df, excel_file, sheetName = sheet, rowNames=FALSE, asTable=asTable, ...) | |
if (isTRUE(return.tibble) & ('tibble' %in% row.names(installed.packages()))) df <- tibble::as_tibble(df) | |
return(invisible(df)) | |
} | |
if (file.exists(excel_file) & !isTRUE(updatefile)) stop(sprintf("File %s exists and 'updatefile' set to FALSE.", excel_file)) | |
wb <- openxlsx::loadWorkbook(excel_file) | |
sheets <- openxlsx::getSheetNames(excel_file) | |
if (sheet %in% sheets){ | |
if (overwritesheet==FALSE) stop(sprintf("Sheet %s exists in file and 'overwritesheet' set to FALSE, please try again with a new sheet name or with 'overwritesheet=TRUE' option", sheet)) | |
openxlsx::removeWorksheet(wb, sheet) | |
} | |
openxlsx::addWorksheet(wb, sheet) | |
if (asTable==TRUE){ | |
openxlsx::writeDataTable(wb, sheet, df) | |
} else { | |
openxlsx::writeData(wb, sheet, df) | |
} | |
openxlsx::saveWorkbook(wb, excel_file, overwrite = updatefile) | |
# openxlsx::write.xlsx(df, excel_file, sheetName = sheet, row.names=FALSE, ...) | |
if (isTRUE(return.tibble) & ('tibble' %in% row.names(installed.packages()))) df <- tibble::as_tibble(df) | |
return(invisible(df)) | |
} | |
# extract hyperlinks from excel sheets (see https://stackoverflow.com/a/70676505/5346827) | |
util$hyperlinks_from_excel <- function(aExcelFile, ExcelSheet="all", | |
aRefOutputFile = NULL){ | |
req_pkgs <- c("XML", "purrr", "dplyr", "readxl") | |
if (!'rlang' %in% row.names(installed.packages())) stop("'rlang' package is not installed, please install it and try again") | |
# if (!"rlang" %in% installed.packages() require(XML)) stop("XML package is not installed") | |
rlang::check_installed(req_pkgs, | |
reason = "Package XML is required to extract the urls from the sheets, while purrr and dplyr are needed to process the results") | |
# ExcelSheet="submission_info" | |
# aExcelFile="sample_info/AGRF_NGS_yield.xlsx" | |
sheets <- readxl::excel_sheets(aExcelFile) | |
read_relationships <- function(aSheetIndex, tmp_dir){ | |
# tmp_dir= tmp_base_dir | |
# aSheetIndex = 2 | |
filename <- file.path(tmp_dir, 'xl', 'worksheets', '_rels', paste0('sheet', aSheetIndex, '.xml.rels')) | |
rel <- XML::xmlParse(filename) %>% XML::xmlToList() %>% purrr::map_dfr(as.list) %>% | |
dplyr::filter(grepl("hyperlink$", Type)) %>% | |
dplyr::select(id=Id, target=Target) #%>% setNames() | |
if(nrow(rel) == 0){ | |
return(NULL) | |
} | |
filename <- file.path(tmp_dir, 'xl', 'worksheets', paste0('sheet', aSheetIndex, '.xml')) | |
pos <- XML::xmlParse(filename) %>% XML::xmlToList() | |
if(is.null(pos$hyperlinks)){ | |
return(NULL) | |
} | |
ret <- purrr::map_dfr(pos$hyperlinks, as.list) %>% | |
# dplyr::select(-uid) %>% | |
dplyr::inner_join(rel, by = 'id') %>% | |
dplyr::mutate(tab_idx=aSheetIndex) | |
link_text <- ret %>% dplyr::select(text=display, target) %>% | |
dplyr::filter(!is.na(text)) %>% | |
dplyr::distinct() | |
return(ret %>% left_join(link_text)) | |
} | |
EXCEL_TEMP_NAME <- 'unzipped_excel' | |
tmp_base_dir <- file.path(tempdir(), | |
paste0('tmpexcl', | |
as.character(round(runif(1, 1000000000000, 9999999999999))))) | |
dir.create(tmp_base_dir) | |
on.exit(unlink(tmp_base_dir)) | |
zipfile <- file.path(tmp_base_dir, paste0(EXCEL_TEMP_NAME, '.zip')) | |
file.copy(from = aExcelFile, to = zipfile) | |
unzip(zipfile, exdir = tmp_base_dir) | |
sheet_idx <- NULL | |
# ExcelSheet=sheets[1] | |
if (is.numeric(ExcelSheet) & all(ExcelSheet %in% seq_along(sheets))) { | |
sheet_idx <- ExcelSheet | |
} else if (is.character(ExcelSheet) & all(ExcelSheet %in% sheets)) { | |
sheet_idx <- which(sheets %in% ExcelSheet) | |
} else if (tolower(ExcelSheet)=="all") { | |
sheet_idx <- seq_along(sheets) | |
} | |
if (length(sheet_idx)==0) stop("Sheets provided to 'ExcelSheet' argument not present in the Excel file") | |
res <- purrr::map_dfr(sheet_idx, ~read_relationships(.x, tmp_base_dir)) %>% | |
dplyr::mutate(sheet = sheets[tab_idx]) %>% | |
dplyr::mutate(ref = paste0("'", sheet, "'!", ref)) %>% | |
dplyr::select(id,tab_idx, sheet, ref, target, text) | |
if(!is.null(aRefOutputFile)){ | |
write_csv(res, aRefOutputFile) | |
} | |
return(res) | |
} | |
######################################## | |
## Misc small routines | |
# calculates standard error | |
util$se <- function(x, na.rm=TRUE) { | |
if(na.rm) return(sqrt(var(x, na.rm=TRUE)/length(x[!is.na(x)]))) else return(sqrt(var(x)/length(x))) | |
} | |
# calculate geometric mean (see https://stackoverflow.com/a/25555105/5346827) | |
util$gm_mean = function(x, na.rm=TRUE){ | |
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)) | |
} | |
util$as.c <- as.character | |
# split vector into chunks (see https://stackoverflow.com/questions/3318333/split-a-vector-into-chunks) | |
util$chunk <- function(x, n) (mapply(function(a, b) (x[a:b]), seq.int(from=1, to=length(x), by=n), pmin(seq.int(from=1, to=length(x), by=n)+(n-1), length(x)), SIMPLIFY=FALSE)) | |
util$chunk2 <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE)) | |
util$unwhich <- function(indices, len=length(indices)) { | |
# reverse of which(): from indices to boolean mask. | |
ret = rep(F,len) | |
ret[indices] = T | |
ret | |
} | |
util$nna <- function(...) !is.na(...) # i type this a lot, i think its worth 3 characters + shift key | |
util$shuffle <- function(...) UseMethod("shuffle") | |
util$shuffle.default <- function(x) x[order(runif(length(x)))] | |
util$shuffle.data.frame <- function(x) x[order(runif(nrow(x))),] | |
util$sample_df <- function(d, size=10, ...) { | |
samp = sample(1:nrow(d), size=size, ...) | |
d[samp,] | |
} | |
# split a dataframe into subsets by size | |
util$split_df <- function(df, group_size){ | |
split(df, gl(ceiling(nrow(df)/group_size), group_size, nrow(df))) | |
} | |
util$present_levels <- function(x) intersect(levels(x), x) | |
util$trim_levels <- function(...) UseMethod("trim_levels") | |
util$trim_levels.factor <- function(x) factor(x, levels=present_levels(x)) | |
util$trim_levels.data.frame <- function(x) { | |
for (n in names(x)) | |
if (is.factor(x[,n])) | |
x[,n] = trim_levels(x[,n]) | |
x | |
} | |
util$unfactor <- function(df){ | |
id <- sapply(df, is.factor) | |
df[id] <- lapply(df[id], as.character) | |
df | |
} | |
# grep() returns indices of matches. Variants: | |
util$bgrep <- function(pat,x, ...) { | |
# "boolean" grep: return a logical vector ready for vector ops | |
# like & | and others | |
unwhich(grep(pat,x,...), length(x)) | |
} | |
util$ngrep <- function(pat,x, ...) | |
# "normal" grep: return values, not indices | |
x[grep(pat,x,...)] | |
######################################## | |
## Other data manipulation routines | |
util$tidy_transpose <- function(df) { | |
df %>% t() %>% # Tranpose, but function is for matrices. Return Matrix | |
as.data.frame() %>% #Force to be dataframe | |
tibble::rownames_to_column(var = "rowname") %>% #Resave first column from rownames | |
janitor::row_to_names(row_number = 1) #Resave column headers from first row. | |
} | |
util$tapply2 <- function(x, ...) { | |
# like tapply but preserves factors | |
if (is.factor(x)) { | |
r = factor(tapply(as.character(x), ...), levels=levels(x)) | |
} else { | |
r = tapply(x, ...) | |
} | |
r | |
} | |
util$inject <- function(collection, start, fn) { | |
# like lisp reduce. (named after ruby) | |
acc = start | |
for (x in collection) | |
acc = fn(acc, x) | |
acc | |
} | |
##### Bioinformatics Annotation ####### | |
# Shortens and "prettify" long BLAST descriptions | |
util$pretty_term <- function(term, prefix="Predicted:", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, firstCapOnly=FALSE, | |
spDot=TRUE, charNum=0, showTrimmed=NULL) { | |
term <- sapply(term, function (s) { | |
s_ <- s | |
if (is.character(prefix) && length(prefix)>0) s_ <- sub(sprintf("^%s\\s*",prefix) ,"", s_, ignore.case = TRUE, perl=TRUE) | |
if (cap) s_ <- sub("(^.)", "\\U\\1", s_, perl=TRUE) | |
if (firstCapOnly) s_ <- paste(c(unlist(strsplit(s_, " ", fixed = TRUE))[1], sapply(unlist(strsplit(s_, " ", fixed = TRUE))[-1], function(n) sub("(^.)", "\\L\\1", n, perl=TRUE), USE.NAMES = FALSE)), collapse = " ") | |
if (spDot) s_ <- sub("sp\\. (.+?)\\.(.+)", "sp. \\1_\\2", s_, perl=TRUE) | |
if (paren) s_ <- sub(" \\(.+\\)", "", s_) | |
if (dot) s_ <- unlist(strsplit(s_, ".", fixed = TRUE))[1] | |
if (comma) s_ <- unlist(strsplit(s_, ",", fixed = TRUE))[1] | |
# Trim to the end of the word if character number limit is reached and remove any trailing commas or dots | |
if (charNum>0) s_ <- sub("[,|\\.]$", "", sub(sprintf("(^.{%s}[\\S]*)[ ]*.*", charNum), "\\1", s_, perl=TRUE), perl = TRUE) | |
# Add a ^ at the end if the term was trimmed anyhow | |
if (nchar(s_)<nchar(s) && !grepl("\\*$", s_) && !is.null(showTrimmed)) return(paste0(s_, as.character(showTrimmed))) | |
else return(s_) | |
}, USE.NAMES = FALSE) | |
return(term) | |
} | |
# Find unique Tax_name | |
util$find_uniq_tax <- function(desc, taxName, prettify=FALSE) { | |
if (is.na(taxName) || taxName=="N/A") uniqTax <- ifelse((is.na(desc) || !grepl(".+\\[(.*)\\]", desc)), NA, sub(".+\\[(.*)\\]","\\1", desc)) | |
else { | |
if (grepl(";", taxName)) { | |
taxList <- unlist(strsplit(taxName, ";")) | |
taxDesc <- sub(".+\\[(.*)\\]","\\1", desc) | |
uniqTax <- ifelse(taxDesc %in% taxList, taxDesc, taxList[length(taxList)]) | |
} else uniqTax <- taxName | |
} | |
if (prettify) uniqTax <- pretty_term(uniqTax, prefix="Undetermined", comma=FALSE, dot=FALSE, paren=FALSE, cap=TRUE, spDot=TRUE, charNum=0, showTrimmed=FALSE) | |
return(uniqTax) | |
} | |
# Resolve scientific name (with taxize package) | |
util$resolve_scientific <- function(tax_names) { | |
tax_count=0 | |
resolved <- lapply(tax_names, function(x) { | |
tax_count <<- tax_count + 1 | |
perc <- tax_count/length(tax_names)*100 | |
message(sprintf("Processing %s (%.1f%% completed)", x, perc)) | |
res_name <- try(taxize::gnr_resolve(names = x, data_source_ids=4, canonical = TRUE,best_match_only=FALSE ), TRUE) | |
if (!inherits(res_name, "try-error")) return(res_name$results[which.max(res_name$results$score),]) | |
else { | |
message(sprintf("Warning: Could not process %s correctly, returning NAs", x)) | |
return(data.frame(submitted_name=x,matched_name=NA,data_source_title=NA,score=NA)) | |
} | |
} ) | |
return(resolved) | |
} | |
# Define a function that accepts a matrix of DE contrasts and DESeq data to analyse each contrast | |
util$get_deseq2_res <- function(dds_deseq2, mat_col, group_col = "sample_group") { | |
contrast <- c(group_col,rev(mat_col)) | |
comparison <- paste0(rev(mat_col), collapse="_vs_") | |
res <- DESeq2::results(dds_deseq2, contrast=contrast, tidy = TRUE) %>% as_tibble() %>% | |
dplyr::rename(gene_id=row) %>% | |
dplyr::mutate(contrast=comparison, analysis_date=analysis_date) %>% | |
dplyr::filter(!is.na(padj)) %>% dplyr::arrange(padj, desc(abs(log2FoldChange))) | |
return(res) | |
} | |
# Modify plotPCA to return all 4 PCs | |
util$plotPCA4 <- function (dds_deseq2, intgroup = "condition", ntop = 500, PCs=3:4, returnData = FALSE){ | |
rv <- genefilter::rowVars(SummarizedExperiment::assay(dds_deseq2)) | |
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, | |
length(rv)))] | |
pca <- prcomp(t(SummarizedExperiment::assay(dds_deseq2)[select, ])) | |
percentVar <- pca$sdev^2/sum(pca$sdev^2) | |
if (!all(intgroup %in% names(SummarizedExperiment::colData(dds_deseq2)))) { | |
stop("the argument 'intgroup' should specify columns of colData(dds)") | |
} | |
intgroup.df <- as.data.frame(SummarizedExperiment::colData(dds_deseq2)[, intgroup, | |
drop = FALSE]) | |
group <- if (length(intgroup) > 1) { | |
factor(apply(intgroup.df, 1, paste, collapse = " : ")) | |
} | |
else { | |
SummarizedExperiment::colData(dds_deseq2)[[intgroup]] | |
} | |
d <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2], PC3 = pca$x[, 3], PC4 = pca$x[, 4], group = group, | |
intgroup.df, name = colnames(dds_deseq2)) | |
if (returnData) { | |
attr(d, "percentVar") <- percentVar[1:4] | |
return(d) | |
} | |
ggplot2::ggplot(data = d, aes_string(x = paste0("PC", PCs[1]), y = paste0("PC", PCs[2]), color = "group")) + | |
ggplot2::geom_point(size = 3) + | |
ggplot2::labs(x = sprintf("PC%d: %.2f%% variance", PCs[1], percentVar[PCs[1]]*100), | |
y = sprintf("PC%d: %.2f%% variance", PCs[2], percentVar[PCs[2]]*100)) + ggplot2::coord_fixed() | |
# + ylab(paste0("PC4: ", round(percentVar[2] * 100), "% variance")) | |
} | |
# blast headers (see https://www.metagenomics.wiki/tools/blast/blastn-output-format-6) | |
util$blast_std_headers <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", | |
"qstart", "qend", "sstart", "send", "evalue", "bitscore") | |
# blast extended headers (with titles and taxonomy) | |
util$blast_tax_headers <- c("qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", | |
"qstart", "qend", "sstart", "send", "evalue", "bitscore", | |
'stitle', 'staxids', 'ssciname', 'scomname') | |
util$interpro_headers <- c("protein","md5","length","analysis","accession","description","start","stop","score","status", | |
"date","interpro_accession","interpro_description","go_annotation","pathway_annotation") | |
######################################## | |
## Printing, viewing | |
## see also: str() | |
## Transparent colors | |
## Mark Gardener 2015 | |
## www.dataanalytics.org.uk | |
util$t_col <- function(color, percent = 50, name = NULL) { | |
# color = color name, percent = % transparency, name = an optional name for the color | |
## Get RGB values for named color | |
rgb.val <- col2rgb(color) | |
## Make new color using input color as base and alpha set by transparency | |
t.col <- rgb(rgb.val[1], rgb.val[2], rgb.val[3], | |
max = 255, | |
alpha = (100-percent)*255/100, | |
names = name) | |
## Save the color | |
invisible(t.col) | |
} | |
util$printf <- function(...) cat(sprintf(...)) | |
util$listprint <- function(x) { | |
s = paste(sapply(names(x), function(n) sprintf("%s=%s", n,x[[n]])), collapse=' ') | |
printf("%s\n", s) | |
} | |
util$msg <- function(...) cat(..., "\n", file=stderr()) | |
util$LogMsg <- function(text, dateformat="%d-%m-%Y %I:%M:%S", ...) { | |
if (is.null(dateformat) | (dateformat==FALSE)) return(cat(paste0(text, "\n"))) | |
msg <- sprintf(paste0(format(Sys.time(), dateformat), ": ", text, "\n"), ...) | |
cat(msg) | |
} | |
util$h = utils::head | |
util$ppy <- function(x, column.major=FALSE, ...) { | |
# pretty-print as yaml. intended for rows with big textual cells. | |
# a la mysql's \G operator | |
library(yaml) | |
cat(as.yaml(x, column.major=column.major), ...) | |
cat("\n", ...) | |
} | |
util$table_html = function(...) { | |
# Intended for inside dosink() | |
columns = list(...) | |
ncol = length(columns) | |
nrow = length(columns[[1]]) | |
# assume columns are in parallel | |
printf("\n<table cellpadding=3 border=1 cellspacing=0 bordercolor=gray>") | |
for (i in 1:nrow) { | |
printf("\n<tr>") | |
for (j in 1:ncol) | |
printf("\n <td>%s", columns[[j]][i]) | |
} | |
printf("\n</table>\n") | |
} | |
######################################## | |
## Workspace management | |
# Fix devtools | |
util$.fixdevtools <- function() { | |
dev_reqs <- c("httr", "devtools") | |
install.deps(dev_reqs) | |
httr::set_config( httr::config( ssl_verifypeer = 0L ) ) | |
options(buildtools.check = function(action) TRUE ) | |
} | |
# Silently load a package, can change mode to "require" | |
util$silent.load <- function(a.package, mode="library"){ | |
eval(parse(text=sprintf("suppressWarnings(suppressPackageStartupMessages( | |
%s(a.package, character.only=TRUE)))", mode))) | |
} | |
# install and load packages from CRAN, Bioconductor or git, see usage below | |
util$install.deps <- function(p, repo="cran", reposite=getOption("repos"), silent=FALSE, force=FALSE){ | |
# repos=getOption("repos") | |
if (is.null(reposite)) reposite <- "http://cloud.r-project.org/" | |
call_install <- switch(repo, | |
cran=sprintf("install.packages(package, repos=\"%s\"", reposite), | |
bioc="biocLite(package, suppressUpdates=TRUE, ask=FALSE", | |
git="install_github(package") | |
if (repo=="git") .fixdevtools() | |
if (repo=="bioc" & !all(p %in% utils::installed.packages())) { | |
#source("http://bioconductor.org/biocLite.R") | |
if (!"RCurl" %in% utils::installed.packages()) { | |
install.packages("RCurl", repos=reposite) | |
} | |
eval(parse(text = RCurl::getURL("http://bioconductor.org/biocLite.R", | |
ssl.verifypeer=FALSE))) | |
biocLite(suppressUpdates=TRUE, ask=FALSE) | |
} | |
for (package in p) { | |
gitNames <- unlist(strsplit(package, "/")) | |
pName <- gitNames[length(gitNames)] | |
if (!pName %in% utils::installed.packages() | isTRUE(force)) { | |
cat(sprintf("Please wait, installing and loading missing package %s and its dependencies from %s.\n", | |
package, repo), file=stderr()) | |
suppressWarnings(eval(parse(text = sprintf("%s, quiet = TRUE, lib=.libPaths()[1])",call_install)))) | |
if (!pName %in% utils::installed.packages()) { | |
ifelse(!interactive(),q("no", 2, TRUE), | |
stop(sprintf("Unable to install missing package %s from %s.\n", | |
package, repo), call. = FALSE)) | |
} | |
} | |
if (isTRUE(silent)) silent.load(pName, mode="require") | |
else require(pName, character.only=T, quietly=TRUE, warn.conflicts=TRUE) | |
} | |
} | |
# CRAN_packages <- c("RCurl", "dplyr", "tidyr", "ggplot2", "RColorBrewer", "RSQLite") | |
# install.deps(CRAN_packages) | |
# improved list of objects | |
# http://stackoverflow.com/questions/1358003/tricks-to-manage-the-available-memory-in-an-r-session | |
util$list_objects = function (pos = 1, pattern) { | |
napply = function(names, fn) sapply(names, function(x) | |
fn(get(x, pos = pos))) | |
names = ls(pos = pos, pattern = pattern) | |
N = length(names) | |
obj_class = napply(names, function(x) as.character(class(x))[1]) | |
obj_mode = napply(names, mode) | |
obj_type = ifelse(is.na(obj_class), obj_mode, obj_class) | |
obj_prettysize = napply(names, function(x) { | |
capture.output(print(object.size(x), units = "auto")) }) | |
obj_size = napply(names, object.size) | |
obj_prettysize[obj_size < 1e6] = "" | |
obj_length = napply(names, function(x) length(x)) | |
obj_dim = t(napply(names, function(x) | |
as.numeric(dim(x))[1:2])) | |
is_flat = is.na(obj_dim)[, 1] | |
is_vector = napply(names, function(x) is.vector(x) & class(x) != 'list') | |
info_width = max(20, options('width')$width - 60) | |
small_str = function(x) { | |
out = capture.output( | |
str(x, max.level=0, give.attr=F, give.head=F, width=info_width, strict.width='cut') | |
) | |
out = str_c(out,collapse=' ') | |
out = cutoff(str_replace(out,"\n"," ")) | |
if (str_detect(out, "^List of")) | |
out = str_c("[Names] $ ", str_c(names(x),collapse=' ')) | |
cutoff(out) | |
} | |
cutoff = function(s) { | |
if (str_length(s) >= info_width) { | |
str_c(str_sub(s,1,info_width-2),'..') | |
} else { | |
s | |
} | |
} | |
pad = function(s) sprintf(" %s", s) | |
out <- data.frame( | |
Type = obj_type, | |
Size = obj_prettysize, | |
Dim = ifelse(is_vector | is_flat, obj_length, | |
sprintf("(%s, %s)", obj_dim[,1], obj_dim[,2])), | |
Value = napply(names, function(x) | |
if (class(x) %in% c('data.frame','list') && !is.null(names(x))) | |
cutoff(str_c("[Names] $ ",str_c(names(x), collapse=' '))) | |
else small_str(x) | |
), | |
stringsAsFactors=F) | |
row.names(out) = names | |
out$Dim = sprintf(" %s", out$Dim) | |
out$Value = sprintf(str_c(" %-", info_width, "s"), out$Value) | |
out = rbind(subset(out, Type!='function'), subset(out, Type=='function')) | |
out | |
} | |
util$lsos = function() { | |
d = list_objects() | |
d$name = row.names(d) | |
d = subset(d, name != 'util') | |
row.names(d)=d$name | |
d$name=NULL | |
d | |
} | |
######################################## | |
## For performance optimization and long-running jobs | |
util$timeit <- function(expr, name=NULL) { | |
# print how long the expression takes, and return its value too. | |
# So you can interpose timeit({ blabla }) around any chunk of code "blabla". | |
start = Sys.time() | |
ret = eval(expr) | |
finish = Sys.time() | |
if (!is.null(name)) cat(name,": ") | |
print(finish-start) | |
invisible(ret) | |
} | |
util$dotprogress <- function(callback, interval=10) { | |
# intended to wrap the anonymous callback for sapply() or somesuch. | |
# ALTERNATIVE: plyr *ply(.progress='text') | |
count = 0 | |
return(function(...) { | |
if ((count <<- count+1) %% interval == 0) | |
cat(".") | |
callback(...) | |
}) | |
} | |
######################################## | |
## External programs for interactivity | |
######################################## | |
## Graphics output wrappers | |
## For easy one-liners, like: | |
## dopdf("tmp.pdf",width=5,height=5,cmd=plot(x,y)) | |
util$dopdf <- function(filename,..., cmd) { | |
pdf(filename, ...) | |
eval(cmd) | |
dev.off() | |
if (exists('OPEN') && OPEN) | |
system(sprintf("open %s", filename)) | |
} | |
util$dopng <- function(filename,..., cmd) { | |
png(filename, ...) | |
eval(cmd) | |
dev.off() | |
if ((exists('OPEN') && OPEN)) | |
system(sprintf("open %s", filename)) | |
} | |
util$dosink <- function(filename,cmd, open=NULL) { | |
# like capture.output() but follows open/OPEN conventions here | |
sink(filename) | |
eval(cmd) | |
sink(NULL) | |
if (prio_check(open, exists('OPEN') && OPEN)) | |
system(sprintf("open %s", filename)) | |
} | |
util$dosvg <- function(filename, ..., cmd, open=NULL) { | |
require("RSvgDevice") | |
devSVG(filename, ...) | |
eval(cmd) | |
dev.off() | |
if (prio_check(open, exists('OPEN') && OPEN)) | |
system(sprintf("open %s", filename)) | |
} | |
# An R function to save pheatmap figure into pdf | |
# This was copied from Stackflow: https://stackoverflow.com/questions/43051525/how-to-draw-pheatmap-plot-to-screen-and-also-save-to-file | |
util$save_pheatmap_pdf <- function(x, filename, width=7, height=7) { | |
stopifnot(!missing(x)) | |
stopifnot(!missing(filename)) | |
pdf(filename, width=width, height=height) | |
grid::grid.newpage() | |
grid::grid.draw(x$gtable) | |
dev.off() | |
} | |
# A default plotting theme | |
util$plot_theme <- function(def_theme="grey", baseSize=22, strip_fill="lightskyblue"){ | |
require("ggplot2") | |
my_theme <- eval(parse(text = sprintf("theme_%s(base_size=%d)", def_theme,baseSize))) + | |
theme(axis.title.y=element_text(face="bold", vjust = 1.5, size=rel(0.8)), | |
axis.title.x=element_text(face="bold", vjust = 0.1, size=rel(0.8)), | |
legend.title=element_text(size=rel(0.85), hjust=0.5), | |
legend.text=element_text(size = rel(0.75),lineheight = 1.5), | |
axis.text.y=element_text(size=rel(0.85)), | |
axis.text.x=element_text(size=rel(0.85)), | |
#panel.grid.minor=element_blank(), | |
strip.text=element_text(size=rel(0.75)), | |
strip.background=element_rect(fill = strip_fill)) | |
return(my_theme) | |
} | |
# plot correlation matrix (https://statsandr.com/blog/correlation-coefficient-and-correlation-test-in-r/) | |
# do not edit | |
util$corrplot2 <- function(data, | |
method = "pearson", | |
sig.level = 0.05, | |
order = "original", | |
diag = FALSE, | |
type = "upper", | |
tl.srt = 90, | |
number.font = 1, | |
number.cex = 1, | |
mar = c(0, 0, 0, 0)) { | |
require("corrplot") | |
data_incomplete <- data | |
data <- data[complete.cases(data), ] | |
mat <- cor(data, method = method) | |
cor.mtest <- function(mat, method) { | |
mat <- as.matrix(mat) | |
n <- ncol(mat) | |
p.mat <- matrix(NA, n, n) | |
diag(p.mat) <- 0 | |
for (i in 1:(n - 1)) { | |
for (j in (i + 1):n) { | |
tmp <- cor.test(mat[, i], mat[, j], method = method) | |
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value | |
} | |
} | |
colnames(p.mat) <- rownames(p.mat) <- colnames(mat) | |
p.mat | |
} | |
p.mat <- cor.mtest(data, method = method) | |
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")) | |
corrplot(mat, | |
method = "color", col = col(200), number.font = number.font, | |
mar = mar, number.cex = number.cex, | |
type = type, order = order, | |
addCoef.col = "black", # add correlation coefficient | |
tl.col = "black", tl.srt = tl.srt, # rotation of text labels | |
# combine with significance level | |
p.mat = p.mat, sig.level = sig.level, insig = "blank", | |
# hide correlation coefficients on the diagonal | |
diag = diag | |
) | |
} | |
# This function makes a pretty ggplot theme (https://ourcodingclub.github.io/tutorials/dataviz-beautification-synthesis/#distributions) | |
# This function takes no arguments | |
# meaning that you always have just niwot_theme() and not niwot_theme(something else here) | |
util$theme_niwot <- function(baseSize=16){ | |
theme_bw(base_size=baseSize) + | |
theme(text = element_text(family = "Helvetica Light"), | |
axis.text = element_text(size = rel(0.85)), | |
# axis.title = element_text(size = 18), | |
axis.line.x = element_line(color="black"), | |
axis.line.y = element_line(color="black"), | |
panel.border = element_blank(), | |
panel.grid.major.x = element_blank(), | |
panel.grid.minor.x = element_blank(), | |
panel.grid.minor.y = element_blank(), | |
panel.grid.major.y = element_blank(), | |
plot.margin = unit(c(1, 1, 1, 1), units = , "cm"), | |
plot.title = element_text(vjust = 1, hjust = 0), | |
legend.text = element_text(size = rel(0.85)), | |
legend.title = element_blank(), | |
legend.position = c(0.95, 0.15), | |
legend.key = element_blank(), | |
legend.background = element_rect(color = "black", | |
fill = "transparent", | |
size = 2, linetype = "blank")) | |
} | |
# Create a filename with folder and current date/time before the extension | |
util$filedate <- function(filename, ext="pdf", outdir=".", dateformat="%d_%m_%Y"){ | |
if (outdir!="." & !file.exists(outdir)) dir.create(outdir, recursive = TRUE) | |
ext <- sub("\\.*(.+)", "\\1", ext) | |
if (is.null(dateformat)) return(file.path(outdir, sprintf("%s.%s", filename, ext))) | |
if (dateformat==FALSE) return(file.path(outdir, sprintf("%s.%s", filename, ext))) | |
return(file.path(outdir, sprintf("%s_%s.%s", filename, format(Sys.time(), dateformat), ext))) | |
} | |
######################################## | |
## Plotting routines | |
# Create nice log axis breaks (see package "scales") | |
util$log_breaks <- function(n = 10, minor=TRUE, major=TRUE, pretty=FALSE){ | |
function(x) { | |
b <- axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n) | |
if (!minor) b <- b[grepl("1", prettyNum(b))] | |
if (!major) b <- b[grepl("5", prettyNum(b))] | |
if (pretty) b <- prettyNum(b) | |
return(b) | |
} | |
} | |
# Create log axis labels | |
util$base_breaks <- function(n = 10){ | |
function(x) { | |
axisTicks(log10(range(x, na.rm = TRUE)), log = TRUE, n = n) | |
} | |
} | |
# A function factory for getting integer y-axis values. | |
util$integer_breaks <- function(n = 5, ...) { | |
fxn <- function(x) { | |
breaks <- floor(pretty(x, n, ...)) | |
names(breaks) <- attr(breaks, "labels") | |
breaks | |
} | |
return(fxn) | |
} | |
util$linelight <- function(x,y, lty='dashed', col='lightgray', ...) { | |
# highlight a point with lines running to the axes. | |
left = par('usr')[1] | |
bot = par('usr')[3] | |
segments(left,y, x,y, lty=lty, col=col, ...) | |
segments(x,bot, x,y, lty=lty, col=col, ...) | |
} | |
util$hintonplot <- function(mat, max_value=max(abs(mat)), mid_value=0, ...) { | |
# Plots a matrix as colored, size-varying boxes | |
# I dunno who started calling this a "Hinton plot", but anyways | |
# Example: | |
# hintonplot(matrix(rnorm(100),10)) | |
# Example, for counts: | |
# table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3)) | |
# mpg | |
# cyl (10.4,18.2] (18.2,26.1] (26.1,33.9] | |
# 4 0 6 5 | |
# 6 2 5 0 | |
# 8 12 2 0 | |
# hintonplot(table(cyl=mtcars$cyl, mpg=cut(mtcars$mpg,3))) | |
plot.new() | |
plot.window(xlim=c(0.5,ncol(mat)+0.5), ylim=c(0.5,nrow(mat)+0.5)) | |
x_mid = 1:ncol(mat) | |
y_mid = 1:nrow(mat) | |
area = abs(mat) / max_value | |
side = sqrt(area) | |
for (x in 1:ncol(mat)) { | |
for (y in nrow(mat):1) { | |
# ym = (nrow(mat):1)[y] | |
ym = y | |
d = side[ym,x] / 2 | |
rect(x-d, y-d, x+d, y+d, col=if (mat[ym,x]>0) 'darkblue' else 'darkred') | |
} | |
} | |
axis(1, 1:ncol(mat), labels=colnames(mat)) | |
# axis(2, nrow(mat):1, labels=row.names(mat)) | |
axis(2, 1:nrow(mat), labels=row.names(mat)) | |
title(xlab=names(dimnames(mat))[2], ylab=names(dimnames(mat))[1], ...) | |
} | |
######################################## | |
## Rmarkdown functions | |
# make captions for DT tables | |
util$DT_caption <- function(ref, caption, tab_width="100%"){ | |
# Do not use underscores or spaces in references! | |
if (grepl("[_ ]", ref)) stop("Do not use underscores (_) or spaces ( ) in figure or table reference!") | |
cat(sprintf("<table width=%s>",tab_width) , | |
sprintf("<caption>(#tab:%s)%s</caption>", ref, caption),"</table>", | |
sep ="\n") | |
} | |
######################################## | |
## Has to be last in file | |
while("util" %in% search()) | |
detach("util") | |
attach(util) | |
Fixed filedate()
to recursively create folder names
Remove dependency on xlsx
package (requires Java) in favor of openxlsx
in write_xslx()
Improved functionality of write_xlsx()
(check dependencies, optional conversion to tibble
)
Added Correlation plot function corrplot2()
(see source)
Added split_df()
to split a dataframe into equal-sized subsets (returns a list of dataframes).
See source on SO
Added chunk()
to split a vector to chunks (very useful for working with purrr
).
See source on SO
Added NCBI blast
headers (see this link)
Available as vectors: blast_std_headers
and blast_tax_headers
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Added bioinformatics related utilities
plotPCA4
for DESeq2 objects,resolve_scientifc
andfind_uniq_tax
to handle taxonomy annotation.These utilities should be included in future package.