Skip to content

Instantly share code, notes, and snippets.

@talegari
Last active January 30, 2017 08:42
Show Gist options
  • Save talegari/fa4e8e72b3a9e23ede2e to your computer and use it in GitHub Desktop.
Save talegari/fa4e8e72b3a9e23ede2e to your computer and use it in GitHub Desktop.
misc R utilities
#######################################################################
# utilities
#######################################################################
# author : talegari (Srikanth KS)
# license : GNU AGPLv3 (http://choosealicense.com/licenses/agpl-3.0/)
#######################################################################
# intro ----
# This is a set of quick utilities(functions) written in R(3.2.2)
# Most of the functions do not come with input sanity check.
# package dependencies
# magrittr
# dplyr
# index ----
# pChange : percentage change from ...
# reverse : reverse a vector
# fv : frequency vector
# rotate : rotate a vector forward or backward
# nonNA : length of non-NA elements in a vector
# findOptParam : Find optimal param value across seeds
# freqRatio : frequency ratio
# sumTo : scale non-negative reals so they sum to ...
# fracUnique : fraction of unique values
# addLineNums : adds ID columnas the first column of the DF
# naBy : replacing NA by something
# nvToDf : create a df from a named vector
# rownamesAsColumn: adds rownames as the first column ...
# cmu : current memory usage
# mmu : maximum memory usage
# jaccard_m : multiset jaccard for two sets A and B
# count : count the number of occurances of pattern in a vector
# colwise_percent : scale columns of non-negative matrix so that each column sums to 100
# rowwise_percent : scale rows of non-negative matrix so that each row sums to 100
# to_percent : convert a numeric vector to a vector of percentages
# pChange ----
# percentage change from ...
#
# input: integer/numeric vector
# output: numeric vector
pChange <- function(vec,from=vec[1]){
(vec - from) %>%
divide_by(from) %>%
multiply_by(100)
}
# reverse ----
# reverse a vector
#
# input: vector
# output: vector
reverse <- function(vec){
vec[seq(from = length(vec)
,to = 1
,by = -1)]
}
# frequency vector ----
#
# input: vector
# output: named integer vector
fv <- function(vec){
if (vec %>% length == 0)
return(0)
t <- vec %>% table
f <- as.integer(t)
names(f) <- names(t)
f[vec %>% unique %>% as.character]
}
# rotate ----
# rotate a vector forward or backward
#
# input: vector
# output: vector
rotate <- function(vec,by=1){
ell <- length(vec)
if(by < 0)
by <- ell - mod(abs(by),ell)
vec[seq(from = 0,to = ell - 1)
%>% add(by)
%>% mod(ell)
%>% add(1)]
}
# n_nonNA ----
# length of non-NA elements in a vector
#
# input: vector
# output: vector
n_nonNA <- function(vec){
sum(is.na(vec) == F)
}
# nonNA ----
# get non-NA elements of a vector
# The resulting NULL is to be later checked with 'is.logical'
#
# input: vector
# output: vector
nonNA <- function(vec){
index <- !is.na(vec)
if (sum(index) == 0)
return(NULL)
vec[index]
}
# findoptparam ----
# Find the most optimal parameter value across seeds
# Motivation
# Different training/testing breakups(seeds) give
# different MSE/accuracy's for differing parameters.
# We intend to find the parameter value which
# min/maximizes the MSE/accuracy.
# Idea
# ranking: rank params for each seed, aggregate over seeds.
# median: consider the median values of the params over seeds
# trimmedMean: trim is passed 'as is' to the mean
# Input format
# matrix with different seeds breakups are rows
# different parameter are columns
# options for optimizeTo: 'min','max'
# option for strategy: 'rankingSum','median','trimmedMean'
# verbose is set to False by default
# Output format
# if verbose is False
# column number indicating the param
# if verbose is True
# param: column number indicating the param.
# sortedParams: sorted scoreSums of params with named params
# pc: percentage change of params wrt to the chosen param
# sd.param: sd of each param across seeds
# sd.seeds: sd of each seed across params
findOptParam <- function(im # input matrix
, optimizeTo = 'min'
, strategy = 'rankingSum'
, verbose = F
, trim = 10)
{
if (strategy == 'rankingSum')
scoreSums <- apply(im,1,rank) %>% t %>% colSums
else if (strategy == 'median')
scoreSums <- apply(im,1,median)
else if (strategy == 'trimmedMean')
scoreSums <- apply(im,1,function(x){mean(x,trim = trim)})
# handling output
if (verbose == F) {
# return param name
if (optimizeTo == 'min') {
which.min(scoreSums) %>% return}
else
which.max(scoreSums) %>% return}
else {
# return a detailed list
if (optimizeTo == 'min') {
sortOrder <- order(scoreSums,decreasing = F)
sortedParams <- scoreSums[sortOrder]
list(
colNum = which.min(scoreSums)
,sortedParams = rbind(sortOrder,sortedParams)
,pc = pChange(sortedParams,from = min(sortedParams)) %>% abs
,sd.param = apply(im,2,sd)
,sd.seeds = apply(im,1,sd))
}
else {
sortOrder <- order(scoreSums,decreasing = T)
sortedParams <- scoreSums[sortOrder]
list(
colNum = which.max(scoreSums)
,sortedParams = rbind(sortOrder,sortedParams)
,pc = pChange(sortedParams,from = max(sortedParams)) %>% abs
,sd.param = apply(im,2,sd)
,sd.seeds = apply(im,1,sd))
}
} # end of returning detailed list
}
# freqRatio -----
# freqRatio : frequency ratio
# see how it handles empty vector
#
# input: vector
# output: named vector
freqRatio <- function(vec){
if (vec %>% length == 0)
return(0)
t <- table(vec)
freq <- t %>% as.integer
names(freq) <- names(t)
ratio <- freq/sum(freq)
ratio[vec %>% unique %>% as.character]
}
# sumTo ----
# sumTo: scale(and not center) non-negative reals
# so they add up to something
#
# input: numeric vector
# output: numeric vector
sumTo <- function(vec,to = 1)
{
if (any(vec < 0))
stop('sumTo needs non-negative reals as a numeric vector')
if (all(vec == 0))
return(vec)
(vec/sum(vec)) * to
}
# fracUnique ----
# fracUnique: fraction of unique values
#
# input: vector
# output: integer/numeric
fracUnique <- function(vec){
ifelse(length(vec) == 0
,0
,n_distinct(vec)/length(vec))
}
# addLineNums ----
# adds 'id' column as the first column of the DF
addLineNums <- function(df){
if ('id' %in% names(df))
stop("ERROR in 'addLineNums': There already exists a column 'id'.")
df %>%
mutate(id = seq_len(df %>% nrow)) %>%
select(id,everything())
}
# naBy ---------------------------------------------
# replacing NA by something, defaults to 0
naBy <- function(df,by = 0){
df[which(is.na(df),arr.ind = T)] <- by
df
}
# nvToDf ----
# create a df from a named vector
nvToDf <- function(nv,stringsAsFactors = F){
if (length(nv) == 0)
return(NULL)
if (nv %>% names %>% is.null)
names <- as.character(1:length(nv))
else
names <- names(nv)
data.frame(rownames = names
,value = nv
,row.names = NULL
,stringsAsFactors = stringsAsFactors)
}
# rownamesAsColumn ----
# adds rownames as the first column to the df and removes rownames
rownamesAsColumn <- function(df,asFactor = F){
if (asFactor == F)
df %>%
mutate(rownames = rownames(df)) %>%
select(rownames,everything())
else
df %>%
mutate(rownames = rownames(df) %>% as.factor) %>%
select(rownames,everything())
}
# cmu - current memory usage ----
# utility to do garbage collection and check current memory usage
# if unit is not mb, memory is reported in gb
cmu <- function(unit = 'mb'){
gcO <- gc(verbose = F)
gcinfo(verbose = F)
ifelse(unit == 'mb'
,gcO[,2] %>% sum
,gcO[,2] %>% sum %>% divide_by(2^10)
)
}
# mmu - maximum memory usage ----
# utility to do garbage collection and check maximum memory usage
# run gc(reset = T) before a big computation
# if unit is not mb, memory is reported in gb
mmu <- function(unit = 'mb'){
gcO <- gc(verbose = F)
gcinfo(verbose = F)
ifelse(unit == 'mb'
,gcO[,6] %>% sum
,gcO[,6] %>% sum %>% divide_by(2^10)
)
}
# multiset jaccard for two sets A and B ----
# Let n(a,A) be the number of occurences of element a in multiset A.
# definition: numerator/denominator where
# numerator = sum( min( n(a,A) , n(a,B) ) for every a in union(A,B))
# denominator = sum( max( n(a,A) , n(a,B) ) for every a in union(A,B))
jaccard_m = function(vec1, vec2){
all = union(vec1, vec2)
counts = lapply( all
, function(x){
c(count(vec1, x), count(vec2, x))
})
num = vapply(counts, min, integer(1))
den = vapply(counts, max, integer(1))
return( sum(num)/sum(den) )
}
# count the number of occurances of pattern in a vector ----
count = function(vec, pattern){
return( sum(grepl(pattern, vec, perl = TRUE)) )
}
# scale columns of non-negative matrix so that each column sums to 100
# a shorter version using `scale` may be written
colwise_percent <- function(mat){
# check whether we have a non-negative matrix
stopifnot(is.matrix(mat) &&
typeof(mat) %in% c("integer", "double") &&
all(mat >= 0)
)
# return a percentage matrix
res_mat <- apply(mat
, 2
, function(x){
s <- sum(x, na.rm = TRUE)
if(s == 0){
res <- rep(0, length(x))
} else {
res <- (x/s) * 100
}
return(res)
}
)
return(res_mat)
}
# scale rows of non-negative matrix so that each row sums to 100
# a shorter version using `scale` may be written
rowwise_percent <- function(mat){
# check whether we have a non-negative matrix
stopifnot(is.matrix(mat) &&
typeof(mat) %in% c("integer", "double") &&
all(mat >= 0)
)
# return a percentage matrix
res_mat <- t(apply(mat
, 1
, function(x){
s <- sum(x, na.rm = TRUE)
if(s == 0){
res <- rep(0, length(x))
} else {
res <- (x/s) * 100
}
return(res)
}
)
)
colnames(res_mat) <- colnames(mat)
return(res_mat)
}
# convert a numeric vector to a vector of percentages
to_percent <- function(vec){
stopifnot(is.numeric(vec))
stopifnot(all(vec >= 0))
stopifnot(!anyNA(vec))
s <- sum(vec)
if(s != 0){
return( (vec/sum(vec)) * 100 )
} else {
return(vec)
}
}
# intapprox
#
# Produce an integer vector approximation of a non-decreasing non-negative
# numeric vector such that it sums to an positive integer
intapprox <- function(vec, n){
stopifnot(is.numeric(vec) || is.integer(vec))
stopifnot(identical(unname(rank(vec, ties.method = "last")), length(vec):1))
stopifnot(all(vec >= 0))
stopifnot(is.numeric(n) && n > 0&& round(n) == n && length(n) == 1)
ratios <- (vec/sum(vec)) * n
intpart <- floor(ratios)
fracpart <- ratios - intpart
leftout <- n - sum(intpart)
if(leftout != 0){
addat <- order(fracpart, length(vec):1, decreasing = TRUE)[1:leftout]
intpart[addat] <- intpart[addat] + 1
}
names(intpart) <- names(vec)
return(intpart)
}
#' @title csv_chunkIndex
#'
#' @description add repeated column to a CSV
#'
#' @param file Path to input CSV file
#' @param size Length of each index
#' @param sep Separator
#' @param colname Column name for the index. It must be NULL if and only if CSV
#' file lacks a header
#' @param outfile Path to output file. If NULL, 'outfile.csv' will be created at
#' working directory.
csv_chunkIndex <- function(file
, size = 1e4
, sep = ","
, colname = NULL
, outfile = NULL){
# assertions ----
if(.Platform$OS.type != "unix"){
stop("Works only on unix based systems.")
}
if(!file_test("-f", file)){
stop("'file' does not exist.")
}
if(size <= 1 || as.integer(size) != size){
stop("'size' must be an integer greater than 1.")
}
if(!(is.character(sep) && length(sep) == 1)){
stop("'sep' must be a string.")
}
if(!is.null(colname)){
if(!(is.character(colname) && length(colname) == 1)){
stop("'colname' must be a string.")
}
}
if(!is.null(outfile)){
if(file_test("-f", outfile)){
stop("'outfile' already exists.")
}
} else{
file.create("outfile.csv")
outfile <- normalizePath("outfile.csv")
}
file.create("outfile_temp.csv")
on.exit(unlink("outfile_temp.csv"))
# find number of lines of the file ----
nol <- system(paste0("wc -l ", normalizePath(file)), intern = TRUE)
nol <- as.integer(strsplit(nol, " ")[[1]][1])
if(!is.null(colname)){
nol <- nol -1
}
# find the number of things ----
num <- floor(nol/size)
rem <- nol %% size
# create one time repeated file ----
if(num != 0){
write(1:num
, file = "outfile_temp.csv"
, ncolumns = 1)
}
# create repeated file ----
repCommand <- paste0("perl -ne 'for$i(0.."
, as.character(size - 1)
, "){print}' "
, "outfile_temp.csv"
, " > "
, outfile
)
system(repCommand, wait = TRUE)
# write reminder lines ----
if(rem != 0){
write(rep(num + 1, rem)
, file = outfile
, ncolumns = 1
, append = TRUE
)
}
# write colname ----
if(!is.null(colname)){
system(paste0("sed -i '1i ", colname, "' ", outfile), wait = TRUE)
}
# rbind to the CSV file ----
system(paste0("cp ", outfile, " outfile_temp.csv"))
unlink(outfile)
system(paste0("paste -d "
, sep
, " "
, "outfile_temp.csv"
, " "
, normalizePath(file)
, " > "
, outfile
)
, wait = TRUE
)
return(outfile)
}
#' @examples
#' write.csv(mtcars, "mtcars.csv", sep = ",", row.names = FALSE)
#' csv_chunkIndex("mtcars.csv"
#' , size = 5
#' , colname = "index"
#' , outfile = "mtcars2.csv"
#' )
#' read.csv("mtcars2.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment