Last active
January 30, 2017 08:42
-
-
Save talegari/fa4e8e72b3a9e23ede2e to your computer and use it in GitHub Desktop.
misc R utilities
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################### | |
# utilities | |
####################################################################### | |
# author : talegari (Srikanth KS) | |
# license : GNU AGPLv3 (http://choosealicense.com/licenses/agpl-3.0/) | |
####################################################################### | |
# intro ---- | |
# This is a set of quick utilities(functions) written in R(3.2.2) | |
# Most of the functions do not come with input sanity check. | |
# package dependencies | |
# magrittr | |
# dplyr | |
# index ---- | |
# pChange : percentage change from ... | |
# reverse : reverse a vector | |
# fv : frequency vector | |
# rotate : rotate a vector forward or backward | |
# nonNA : length of non-NA elements in a vector | |
# findOptParam : Find optimal param value across seeds | |
# freqRatio : frequency ratio | |
# sumTo : scale non-negative reals so they sum to ... | |
# fracUnique : fraction of unique values | |
# addLineNums : adds ID columnas the first column of the DF | |
# naBy : replacing NA by something | |
# nvToDf : create a df from a named vector | |
# rownamesAsColumn: adds rownames as the first column ... | |
# cmu : current memory usage | |
# mmu : maximum memory usage | |
# jaccard_m : multiset jaccard for two sets A and B | |
# count : count the number of occurances of pattern in a vector | |
# colwise_percent : scale columns of non-negative matrix so that each column sums to 100 | |
# rowwise_percent : scale rows of non-negative matrix so that each row sums to 100 | |
# to_percent : convert a numeric vector to a vector of percentages | |
# pChange ---- | |
# percentage change from ... | |
# | |
# input: integer/numeric vector | |
# output: numeric vector | |
pChange <- function(vec,from=vec[1]){ | |
(vec - from) %>% | |
divide_by(from) %>% | |
multiply_by(100) | |
} | |
# reverse ---- | |
# reverse a vector | |
# | |
# input: vector | |
# output: vector | |
reverse <- function(vec){ | |
vec[seq(from = length(vec) | |
,to = 1 | |
,by = -1)] | |
} | |
# frequency vector ---- | |
# | |
# input: vector | |
# output: named integer vector | |
fv <- function(vec){ | |
if (vec %>% length == 0) | |
return(0) | |
t <- vec %>% table | |
f <- as.integer(t) | |
names(f) <- names(t) | |
f[vec %>% unique %>% as.character] | |
} | |
# rotate ---- | |
# rotate a vector forward or backward | |
# | |
# input: vector | |
# output: vector | |
rotate <- function(vec,by=1){ | |
ell <- length(vec) | |
if(by < 0) | |
by <- ell - mod(abs(by),ell) | |
vec[seq(from = 0,to = ell - 1) | |
%>% add(by) | |
%>% mod(ell) | |
%>% add(1)] | |
} | |
# n_nonNA ---- | |
# length of non-NA elements in a vector | |
# | |
# input: vector | |
# output: vector | |
n_nonNA <- function(vec){ | |
sum(is.na(vec) == F) | |
} | |
# nonNA ---- | |
# get non-NA elements of a vector | |
# The resulting NULL is to be later checked with 'is.logical' | |
# | |
# input: vector | |
# output: vector | |
nonNA <- function(vec){ | |
index <- !is.na(vec) | |
if (sum(index) == 0) | |
return(NULL) | |
vec[index] | |
} | |
# findoptparam ---- | |
# Find the most optimal parameter value across seeds | |
# Motivation | |
# Different training/testing breakups(seeds) give | |
# different MSE/accuracy's for differing parameters. | |
# We intend to find the parameter value which | |
# min/maximizes the MSE/accuracy. | |
# Idea | |
# ranking: rank params for each seed, aggregate over seeds. | |
# median: consider the median values of the params over seeds | |
# trimmedMean: trim is passed 'as is' to the mean | |
# Input format | |
# matrix with different seeds breakups are rows | |
# different parameter are columns | |
# options for optimizeTo: 'min','max' | |
# option for strategy: 'rankingSum','median','trimmedMean' | |
# verbose is set to False by default | |
# Output format | |
# if verbose is False | |
# column number indicating the param | |
# if verbose is True | |
# param: column number indicating the param. | |
# sortedParams: sorted scoreSums of params with named params | |
# pc: percentage change of params wrt to the chosen param | |
# sd.param: sd of each param across seeds | |
# sd.seeds: sd of each seed across params | |
findOptParam <- function(im # input matrix | |
, optimizeTo = 'min' | |
, strategy = 'rankingSum' | |
, verbose = F | |
, trim = 10) | |
{ | |
if (strategy == 'rankingSum') | |
scoreSums <- apply(im,1,rank) %>% t %>% colSums | |
else if (strategy == 'median') | |
scoreSums <- apply(im,1,median) | |
else if (strategy == 'trimmedMean') | |
scoreSums <- apply(im,1,function(x){mean(x,trim = trim)}) | |
# handling output | |
if (verbose == F) { | |
# return param name | |
if (optimizeTo == 'min') { | |
which.min(scoreSums) %>% return} | |
else | |
which.max(scoreSums) %>% return} | |
else { | |
# return a detailed list | |
if (optimizeTo == 'min') { | |
sortOrder <- order(scoreSums,decreasing = F) | |
sortedParams <- scoreSums[sortOrder] | |
list( | |
colNum = which.min(scoreSums) | |
,sortedParams = rbind(sortOrder,sortedParams) | |
,pc = pChange(sortedParams,from = min(sortedParams)) %>% abs | |
,sd.param = apply(im,2,sd) | |
,sd.seeds = apply(im,1,sd)) | |
} | |
else { | |
sortOrder <- order(scoreSums,decreasing = T) | |
sortedParams <- scoreSums[sortOrder] | |
list( | |
colNum = which.max(scoreSums) | |
,sortedParams = rbind(sortOrder,sortedParams) | |
,pc = pChange(sortedParams,from = max(sortedParams)) %>% abs | |
,sd.param = apply(im,2,sd) | |
,sd.seeds = apply(im,1,sd)) | |
} | |
} # end of returning detailed list | |
} | |
# freqRatio ----- | |
# freqRatio : frequency ratio | |
# see how it handles empty vector | |
# | |
# input: vector | |
# output: named vector | |
freqRatio <- function(vec){ | |
if (vec %>% length == 0) | |
return(0) | |
t <- table(vec) | |
freq <- t %>% as.integer | |
names(freq) <- names(t) | |
ratio <- freq/sum(freq) | |
ratio[vec %>% unique %>% as.character] | |
} | |
# sumTo ---- | |
# sumTo: scale(and not center) non-negative reals | |
# so they add up to something | |
# | |
# input: numeric vector | |
# output: numeric vector | |
sumTo <- function(vec,to = 1) | |
{ | |
if (any(vec < 0)) | |
stop('sumTo needs non-negative reals as a numeric vector') | |
if (all(vec == 0)) | |
return(vec) | |
(vec/sum(vec)) * to | |
} | |
# fracUnique ---- | |
# fracUnique: fraction of unique values | |
# | |
# input: vector | |
# output: integer/numeric | |
fracUnique <- function(vec){ | |
ifelse(length(vec) == 0 | |
,0 | |
,n_distinct(vec)/length(vec)) | |
} | |
# addLineNums ---- | |
# adds 'id' column as the first column of the DF | |
addLineNums <- function(df){ | |
if ('id' %in% names(df)) | |
stop("ERROR in 'addLineNums': There already exists a column 'id'.") | |
df %>% | |
mutate(id = seq_len(df %>% nrow)) %>% | |
select(id,everything()) | |
} | |
# naBy --------------------------------------------- | |
# replacing NA by something, defaults to 0 | |
naBy <- function(df,by = 0){ | |
df[which(is.na(df),arr.ind = T)] <- by | |
df | |
} | |
# nvToDf ---- | |
# create a df from a named vector | |
nvToDf <- function(nv,stringsAsFactors = F){ | |
if (length(nv) == 0) | |
return(NULL) | |
if (nv %>% names %>% is.null) | |
names <- as.character(1:length(nv)) | |
else | |
names <- names(nv) | |
data.frame(rownames = names | |
,value = nv | |
,row.names = NULL | |
,stringsAsFactors = stringsAsFactors) | |
} | |
# rownamesAsColumn ---- | |
# adds rownames as the first column to the df and removes rownames | |
rownamesAsColumn <- function(df,asFactor = F){ | |
if (asFactor == F) | |
df %>% | |
mutate(rownames = rownames(df)) %>% | |
select(rownames,everything()) | |
else | |
df %>% | |
mutate(rownames = rownames(df) %>% as.factor) %>% | |
select(rownames,everything()) | |
} | |
# cmu - current memory usage ---- | |
# utility to do garbage collection and check current memory usage | |
# if unit is not mb, memory is reported in gb | |
cmu <- function(unit = 'mb'){ | |
gcO <- gc(verbose = F) | |
gcinfo(verbose = F) | |
ifelse(unit == 'mb' | |
,gcO[,2] %>% sum | |
,gcO[,2] %>% sum %>% divide_by(2^10) | |
) | |
} | |
# mmu - maximum memory usage ---- | |
# utility to do garbage collection and check maximum memory usage | |
# run gc(reset = T) before a big computation | |
# if unit is not mb, memory is reported in gb | |
mmu <- function(unit = 'mb'){ | |
gcO <- gc(verbose = F) | |
gcinfo(verbose = F) | |
ifelse(unit == 'mb' | |
,gcO[,6] %>% sum | |
,gcO[,6] %>% sum %>% divide_by(2^10) | |
) | |
} | |
# multiset jaccard for two sets A and B ---- | |
# Let n(a,A) be the number of occurences of element a in multiset A. | |
# definition: numerator/denominator where | |
# numerator = sum( min( n(a,A) , n(a,B) ) for every a in union(A,B)) | |
# denominator = sum( max( n(a,A) , n(a,B) ) for every a in union(A,B)) | |
jaccard_m = function(vec1, vec2){ | |
all = union(vec1, vec2) | |
counts = lapply( all | |
, function(x){ | |
c(count(vec1, x), count(vec2, x)) | |
}) | |
num = vapply(counts, min, integer(1)) | |
den = vapply(counts, max, integer(1)) | |
return( sum(num)/sum(den) ) | |
} | |
# count the number of occurances of pattern in a vector ---- | |
count = function(vec, pattern){ | |
return( sum(grepl(pattern, vec, perl = TRUE)) ) | |
} | |
# scale columns of non-negative matrix so that each column sums to 100 | |
# a shorter version using `scale` may be written | |
colwise_percent <- function(mat){ | |
# check whether we have a non-negative matrix | |
stopifnot(is.matrix(mat) && | |
typeof(mat) %in% c("integer", "double") && | |
all(mat >= 0) | |
) | |
# return a percentage matrix | |
res_mat <- apply(mat | |
, 2 | |
, function(x){ | |
s <- sum(x, na.rm = TRUE) | |
if(s == 0){ | |
res <- rep(0, length(x)) | |
} else { | |
res <- (x/s) * 100 | |
} | |
return(res) | |
} | |
) | |
return(res_mat) | |
} | |
# scale rows of non-negative matrix so that each row sums to 100 | |
# a shorter version using `scale` may be written | |
rowwise_percent <- function(mat){ | |
# check whether we have a non-negative matrix | |
stopifnot(is.matrix(mat) && | |
typeof(mat) %in% c("integer", "double") && | |
all(mat >= 0) | |
) | |
# return a percentage matrix | |
res_mat <- t(apply(mat | |
, 1 | |
, function(x){ | |
s <- sum(x, na.rm = TRUE) | |
if(s == 0){ | |
res <- rep(0, length(x)) | |
} else { | |
res <- (x/s) * 100 | |
} | |
return(res) | |
} | |
) | |
) | |
colnames(res_mat) <- colnames(mat) | |
return(res_mat) | |
} | |
# convert a numeric vector to a vector of percentages | |
to_percent <- function(vec){ | |
stopifnot(is.numeric(vec)) | |
stopifnot(all(vec >= 0)) | |
stopifnot(!anyNA(vec)) | |
s <- sum(vec) | |
if(s != 0){ | |
return( (vec/sum(vec)) * 100 ) | |
} else { | |
return(vec) | |
} | |
} | |
# intapprox | |
# | |
# Produce an integer vector approximation of a non-decreasing non-negative | |
# numeric vector such that it sums to an positive integer | |
intapprox <- function(vec, n){ | |
stopifnot(is.numeric(vec) || is.integer(vec)) | |
stopifnot(identical(unname(rank(vec, ties.method = "last")), length(vec):1)) | |
stopifnot(all(vec >= 0)) | |
stopifnot(is.numeric(n) && n > 0&& round(n) == n && length(n) == 1) | |
ratios <- (vec/sum(vec)) * n | |
intpart <- floor(ratios) | |
fracpart <- ratios - intpart | |
leftout <- n - sum(intpart) | |
if(leftout != 0){ | |
addat <- order(fracpart, length(vec):1, decreasing = TRUE)[1:leftout] | |
intpart[addat] <- intpart[addat] + 1 | |
} | |
names(intpart) <- names(vec) | |
return(intpart) | |
} | |
#' @title csv_chunkIndex | |
#' | |
#' @description add repeated column to a CSV | |
#' | |
#' @param file Path to input CSV file | |
#' @param size Length of each index | |
#' @param sep Separator | |
#' @param colname Column name for the index. It must be NULL if and only if CSV | |
#' file lacks a header | |
#' @param outfile Path to output file. If NULL, 'outfile.csv' will be created at | |
#' working directory. | |
csv_chunkIndex <- function(file | |
, size = 1e4 | |
, sep = "," | |
, colname = NULL | |
, outfile = NULL){ | |
# assertions ---- | |
if(.Platform$OS.type != "unix"){ | |
stop("Works only on unix based systems.") | |
} | |
if(!file_test("-f", file)){ | |
stop("'file' does not exist.") | |
} | |
if(size <= 1 || as.integer(size) != size){ | |
stop("'size' must be an integer greater than 1.") | |
} | |
if(!(is.character(sep) && length(sep) == 1)){ | |
stop("'sep' must be a string.") | |
} | |
if(!is.null(colname)){ | |
if(!(is.character(colname) && length(colname) == 1)){ | |
stop("'colname' must be a string.") | |
} | |
} | |
if(!is.null(outfile)){ | |
if(file_test("-f", outfile)){ | |
stop("'outfile' already exists.") | |
} | |
} else{ | |
file.create("outfile.csv") | |
outfile <- normalizePath("outfile.csv") | |
} | |
file.create("outfile_temp.csv") | |
on.exit(unlink("outfile_temp.csv")) | |
# find number of lines of the file ---- | |
nol <- system(paste0("wc -l ", normalizePath(file)), intern = TRUE) | |
nol <- as.integer(strsplit(nol, " ")[[1]][1]) | |
if(!is.null(colname)){ | |
nol <- nol -1 | |
} | |
# find the number of things ---- | |
num <- floor(nol/size) | |
rem <- nol %% size | |
# create one time repeated file ---- | |
if(num != 0){ | |
write(1:num | |
, file = "outfile_temp.csv" | |
, ncolumns = 1) | |
} | |
# create repeated file ---- | |
repCommand <- paste0("perl -ne 'for$i(0.." | |
, as.character(size - 1) | |
, "){print}' " | |
, "outfile_temp.csv" | |
, " > " | |
, outfile | |
) | |
system(repCommand, wait = TRUE) | |
# write reminder lines ---- | |
if(rem != 0){ | |
write(rep(num + 1, rem) | |
, file = outfile | |
, ncolumns = 1 | |
, append = TRUE | |
) | |
} | |
# write colname ---- | |
if(!is.null(colname)){ | |
system(paste0("sed -i '1i ", colname, "' ", outfile), wait = TRUE) | |
} | |
# rbind to the CSV file ---- | |
system(paste0("cp ", outfile, " outfile_temp.csv")) | |
unlink(outfile) | |
system(paste0("paste -d " | |
, sep | |
, " " | |
, "outfile_temp.csv" | |
, " " | |
, normalizePath(file) | |
, " > " | |
, outfile | |
) | |
, wait = TRUE | |
) | |
return(outfile) | |
} | |
#' @examples | |
#' write.csv(mtcars, "mtcars.csv", sep = ",", row.names = FALSE) | |
#' csv_chunkIndex("mtcars.csv" | |
#' , size = 5 | |
#' , colname = "index" | |
#' , outfile = "mtcars2.csv" | |
#' ) | |
#' read.csv("mtcars2.csv") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment