talegari · January 30, 2017 08:42
diff --git a/utilities.R b/utilities.R
 #######################################################################
 # utilities
 #######################################################################

 # author  : talegari (Srikanth KS)
 # license : GNU AGPLv3 (http://choosealicense.com/licenses/agpl-3.0/)
 #######################################################################

 # intro ----
 # This is a set of quick utilities(functions) written in R(3.2.2)
 # Most of the functions do not come with input sanity check.


 # package dependencies
 # magrittr
 # dplyr

 # index ----
 # pChange         : percentage change from ...
 # reverse         : reverse a vector
 # fv              : frequency vector
 # rotate          : rotate a vector forward or backward
 # nonNA           : length of non-NA elements in a vector
 # findOptParam    : Find optimal param value across seeds
 # freqRatio       : frequency ratio
 # sumTo           : scale non-negative reals so they sum to ...
 # fracUnique      : fraction of unique values
 # addLineNums     : adds ID columnas the first column of the DF
 # naBy            : replacing NA by something
 # nvToDf          : create a df from a named vector
 # rownamesAsColumn: adds rownames as the first column ...
 # cmu             : current memory usage
 # mmu             : maximum memory usage
 # jaccard_m       : multiset jaccard for two sets A and B
 # count           : count the number of occurances of pattern in a vector
 # colwise_percent : scale columns of non-negative matrix so that each column sums to 100
 # rowwise_percent : scale rows of non-negative matrix so that each row sums to 100
 # to_percent      : convert a numeric vector to a vector of percentages

 # pChange ----
 # percentage change from ...
 #
 # input: integer/numeric vector
 # output: numeric vector
 pChange <- function(vec,from=vec[1]){
  (vec - from) %>%
        divide_by(from) %>%
        multiply_by(100)
 }

 # reverse ----
 # reverse a vector
 #
 # input: vector
 # output: vector
 reverse <- function(vec){
  vec[seq(from = length(vec)
          ,to = 1
          ,by = -1)]
 }

 # frequency vector ----
 #
 # input: vector
 # output: named integer vector
 fv <- function(vec){
  if (vec %>% length == 0)
    return(0)
  t <- vec %>% table
  f <- as.integer(t)
  names(f) <- names(t)
  f[vec %>% unique %>% as.character]
 }

 # rotate ----
 # rotate a vector forward or backward
 #
 # input: vector
 # output: vector
 rotate <- function(vec,by=1){
  ell <- length(vec)
  if(by < 0)
    by <- ell - mod(abs(by),ell)
  vec[seq(from = 0,to = ell - 1)
      %>% add(by)
      %>% mod(ell)
      %>% add(1)]
 }

 # n_nonNA ----
 # length of non-NA elements in a vector
 #
 # input: vector
 # output: vector

 n_nonNA <- function(vec){
    sum(is.na(vec) == F)
    }

 # nonNA ----
 # get non-NA elements of a vector
 # The resulting NULL is to be later checked with 'is.logical'
 #
 # input: vector
 # output: vector

 nonNA <- function(vec){
  index <- !is.na(vec)
  if (sum(index) == 0)
    return(NULL)
  vec[index]
 }

 # findoptparam ----
 # Find the most optimal parameter value across seeds

 # Motivation
 # Different training/testing breakups(seeds) give
 # different MSE/accuracy's for differing parameters.
 # We intend to find the parameter value which
 # min/maximizes the MSE/accuracy.

 # Idea
 # ranking: rank params for each seed, aggregate over seeds.
 # median: consider the median values of the params over seeds
 # trimmedMean: trim is passed 'as is' to the mean

 # Input format
 # matrix with different seeds breakups are rows
 # different parameter are columns
 # options for optimizeTo: 'min','max'
 # option for strategy: 'rankingSum','median','trimmedMean'
 # verbose is set to False by default

 # Output format
 # if verbose is False
 #   column number indicating the param
 # if verbose is True
 #   param: column number indicating the param.
 #   sortedParams: sorted scoreSums of params with named params
 #   pc: percentage change of params wrt to the chosen param
 #   sd.param: sd of each param across seeds
 #   sd.seeds: sd of each seed across params

 findOptParam <- function(im # input matrix
                        , optimizeTo = 'min'
                        , strategy = 'rankingSum'
                        , verbose = F
                        , trim = 10)
 {
  if (strategy == 'rankingSum')
    scoreSums <- apply(im,1,rank) %>% t %>% colSums
  else if (strategy == 'median')
    scoreSums <- apply(im,1,median)
  else if (strategy == 'trimmedMean')
    scoreSums <- apply(im,1,function(x){mean(x,trim = trim)})

  # handling output
  if (verbose == F) {
    # return param name
    if (optimizeTo == 'min') {
        which.min(scoreSums) %>% return}
    else
        which.max(scoreSums) %>% return}
  else {
    # return a detailed list
    if (optimizeTo == 'min') {
        sortOrder <- order(scoreSums,decreasing = F)
        sortedParams <- scoreSums[sortOrder]
        list(
            colNum = which.min(scoreSums)
            ,sortedParams = rbind(sortOrder,sortedParams)
            ,pc = pChange(sortedParams,from = min(sortedParams)) %>% abs
            ,sd.param = apply(im,2,sd)
            ,sd.seeds = apply(im,1,sd))
    }
    else {
        sortOrder <- order(scoreSums,decreasing = T)
        sortedParams <- scoreSums[sortOrder]
        list(
            colNum = which.max(scoreSums)
            ,sortedParams = rbind(sortOrder,sortedParams)
            ,pc = pChange(sortedParams,from = max(sortedParams)) %>% abs
            ,sd.param = apply(im,2,sd)
            ,sd.seeds = apply(im,1,sd))
    }
  } # end of returning detailed list
 }

 # freqRatio -----
 # freqRatio : frequency ratio
 # see how it handles empty vector
 #
 # input: vector
 # output: named vector
 freqRatio <- function(vec){
  if (vec %>% length == 0)
    return(0)
  t <- table(vec)
  freq <- t %>% as.integer
  names(freq) <- names(t)
  ratio <- freq/sum(freq)
  ratio[vec %>% unique %>% as.character]
 }

 # sumTo ----
 # sumTo: scale(and not center) non-negative reals
 # so they add up to something
 #
 # input: numeric vector
 # output: numeric vector
 sumTo <- function(vec,to = 1)
  {
    if (any(vec < 0))
        stop('sumTo needs non-negative reals as a numeric vector')
    if (all(vec == 0))
        return(vec)
    (vec/sum(vec)) * to
  }

 # fracUnique ----
 # fracUnique: fraction of unique values
 #
 # input: vector
 # output: integer/numeric
 fracUnique <- function(vec){
  ifelse(length(vec) == 0
         ,0
         ,n_distinct(vec)/length(vec))
 }

 # addLineNums ----
 # adds 'id' column as the first column of the DF
 addLineNums <- function(df){
  if ('id' %in% names(df))
    stop("ERROR in 'addLineNums': There already exists a column 'id'.")
    df %>%
        mutate(id = seq_len(df %>% nrow)) %>%
        select(id,everything())
 }

 # naBy ---------------------------------------------
 # replacing NA by something, defaults to 0
 naBy <- function(df,by = 0){
  df[which(is.na(df),arr.ind = T)] <- by
  df
 }

 # nvToDf ----
 # create a df from a named vector
 nvToDf <- function(nv,stringsAsFactors = F){
    if (length(nv) == 0)
        return(NULL)
    if (nv %>% names %>% is.null)
        names <- as.character(1:length(nv))
    else
        names <- names(nv)
    data.frame(rownames = names
               ,value = nv
               ,row.names = NULL
               ,stringsAsFactors = stringsAsFactors)
 }

 # rownamesAsColumn ----
 # adds rownames as the first column to the df and removes rownames
 rownamesAsColumn <- function(df,asFactor = F){
    if (asFactor == F)
        df %>%
        mutate(rownames = rownames(df)) %>%
        select(rownames,everything())
    else
        df %>%
        mutate(rownames = rownames(df) %>% as.factor) %>%
        select(rownames,everything())
 }

 # cmu - current memory usage ----
 # utility to do garbage collection and check current memory usage
 # if unit is not mb, memory is reported in gb
 cmu <- function(unit = 'mb'){
    gcO <- gc(verbose = F)
    gcinfo(verbose = F)
    ifelse(unit == 'mb'
           ,gcO[,2] %>% sum
           ,gcO[,2] %>% sum %>% divide_by(2^10)
    )
 }

 # mmu - maximum memory usage ----
 # utility to do garbage collection and check maximum memory usage
 # run gc(reset  = T) before a big computation
 # if unit is not mb, memory is reported in gb
 mmu <- function(unit = 'mb'){
    gcO <- gc(verbose = F)
    gcinfo(verbose = F)
    ifelse(unit == 'mb'
           ,gcO[,6] %>% sum
           ,gcO[,6] %>% sum %>% divide_by(2^10)
    )
 }

 # multiset jaccard for two sets A and B ----
 # Let n(a,A) be the number of occurences of element a in multiset A.
 # definition: numerator/denominator where
 # numerator   = sum( min( n(a,A) , n(a,B) ) for every a in union(A,B))
 # denominator = sum( max( n(a,A) , n(a,B) ) for every a in union(A,B))

 jaccard_m = function(vec1, vec2){
  
  all        = union(vec1, vec2)
  counts     = lapply(  all
                      , function(x){
                          c(count(vec1, x), count(vec2, x)) 
                        })
  num        = vapply(counts, min, integer(1))
  den        = vapply(counts, max, integer(1))
  return( sum(num)/sum(den) )
 }

 # count the number of occurances of pattern in a vector ----
 count = function(vec, pattern){
  return( sum(grepl(pattern, vec, perl = TRUE)) )
 }


 # scale columns of non-negative matrix so that each column sums to 100 
 # a shorter version using `scale` may be written

 colwise_percent <- function(mat){
  
  # check whether we have a non-negative matrix
  stopifnot(is.matrix(mat) &&
              typeof(mat) %in% c("integer", "double") &&
              all(mat >= 0)
            )
  # return a percentage matrix
  res_mat <- apply(mat
                   , 2
                   , function(x){
                       s <- sum(x, na.rm = TRUE)
                       if(s == 0){
                         res <- rep(0, length(x))
                       } else {
                         res <- (x/s) * 100
                       }
                       return(res)
                      }
                   )
  
  return(res_mat)
 }

 # scale rows of non-negative matrix so that each row sums to 100 
 # a shorter version using `scale` may be written

 rowwise_percent <- function(mat){
  
  # check whether we have a non-negative matrix
  stopifnot(is.matrix(mat) &&
              typeof(mat) %in% c("integer", "double") &&
              all(mat >= 0)
  )
  # return a percentage matrix
  res_mat <- t(apply(mat
                     , 1
                     , function(x){
                         s <- sum(x, na.rm = TRUE)
                         if(s == 0){
                           res <- rep(0, length(x))
                           } else {
                               res <- (x/s) * 100
                           }
                         return(res)
                         }
                     )
               )
  colnames(res_mat) <- colnames(mat)
  return(res_mat)
 }

 # convert a numeric vector to a vector of percentages
 to_percent <- function(vec){
  stopifnot(is.numeric(vec))
  stopifnot(all(vec >= 0))
  stopifnot(!anyNA(vec))
  
  s <- sum(vec)
  if(s != 0){
    return( (vec/sum(vec)) * 100 )
  } else {
    return(vec)
  }
 }

 # intapprox
 #
 # Produce an integer vector approximation of a non-decreasing non-negative
 # numeric vector such that it sums to an positive integer

 intapprox <- function(vec, n){

  stopifnot(is.numeric(vec) || is.integer(vec))
  stopifnot(identical(unname(rank(vec, ties.method = "last")), length(vec):1))
  stopifnot(all(vec >= 0))
  stopifnot(is.numeric(n) && n > 0&& round(n) == n && length(n) == 1)

  ratios   <- (vec/sum(vec)) * n
  intpart  <- floor(ratios)
  fracpart <- ratios - intpart
  leftout  <- n - sum(intpart)
  if(leftout != 0){
    addat <- order(fracpart, length(vec):1, decreasing = TRUE)[1:leftout]
    intpart[addat] <- intpart[addat] + 1
  }
  names(intpart) <- names(vec)
  return(intpart)
 }
      
 #' @title csv_chunkIndex
 #'
 #' @description add repeated column to a CSV
 #'
 #' @param file Path to input CSV file
 #' @param size Length of each index
 #' @param sep Separator
 #' @param colname Column name for the index. It must be NULL if and only if CSV
 #'   file lacks a header
 #' @param outfile Path to output file. If NULL, 'outfile.csv' will be created at
 #'   working directory.

 csv_chunkIndex <- function(file
                           , size    = 1e4
                           , sep     = ","
                           , colname = NULL
                           , outfile = NULL){

  # assertions                       ----
  if(.Platform$OS.type != "unix"){
    stop("Works only on unix based systems.")
  }

  if(!file_test("-f", file)){
    stop("'file' does not exist.")
  }

  if(size <= 1 || as.integer(size) != size){
    stop("'size' must be an integer greater than 1.")
  }

  if(!(is.character(sep) && length(sep) == 1)){
    stop("'sep' must be a string.")
  }

  if(!is.null(colname)){
    if(!(is.character(colname) && length(colname) == 1)){
      stop("'colname' must be a string.")
    }
  }

  if(!is.null(outfile)){
    if(file_test("-f", outfile)){
      stop("'outfile' already exists.")
    }
  } else{
    file.create("outfile.csv")
    outfile <- normalizePath("outfile.csv")
  }

  file.create("outfile_temp.csv")
  on.exit(unlink("outfile_temp.csv"))

  # find number of lines of the file ----
  nol <- system(paste0("wc -l ", normalizePath(file)), intern = TRUE)
  nol <- as.integer(strsplit(nol, " ")[[1]][1])
  if(!is.null(colname)){
    nol <- nol -1
  }

  # find the number of things        ----
  num <- floor(nol/size)
  rem <- nol %% size

  # create one time repeated file    ----
  if(num != 0){
    write(1:num
          , file = "outfile_temp.csv"
          , ncolumns = 1)
  }

  # create repeated file             ----
  repCommand <-  paste0("perl -ne 'for$i(0.."
                        , as.character(size - 1)
                        , "){print}' "
                        , "outfile_temp.csv"
                        , " > "
                        , outfile
                        )
  system(repCommand, wait = TRUE)

  # write reminder lines             ----
  if(rem != 0){
    write(rep(num + 1, rem)
          , file     = outfile
          , ncolumns = 1
          , append   = TRUE
          )
  }

  # write colname                    ----
  if(!is.null(colname)){
    system(paste0("sed  -i '1i ", colname, "' ", outfile), wait = TRUE)
  }

  # rbind to the CSV file            ----
  system(paste0("cp ", outfile, " outfile_temp.csv"))
  unlink(outfile)
  system(paste0("paste -d "
                , sep
                , " "
                , "outfile_temp.csv"
                , " "
                , normalizePath(file)
                , " > "
                , outfile
                )
         , wait = TRUE
         )

  return(outfile)
 }

 #' @examples
 #' write.csv(mtcars, "mtcars.csv", sep = ",", row.names = FALSE)
 #' csv_chunkIndex("mtcars.csv"
 #'                , size = 5
 #'                , colname = "index"
 #'                , outfile = "mtcars2.csv"
 #'                )
 #' read.csv("mtcars2.csv")
	#######################################################################
	# utilities
	#######################################################################

	# author : talegari (Srikanth KS)
	# license : GNU AGPLv3 (http://choosealicense.com/licenses/agpl-3.0/)
	#######################################################################

	# intro ----
	# This is a set of quick utilities(functions) written in R(3.2.2)
	# Most of the functions do not come with input sanity check.


	# package dependencies
	# magrittr
	# dplyr

	# index ----
	# pChange : percentage change from ...
	# reverse : reverse a vector
	# fv : frequency vector
	# rotate : rotate a vector forward or backward
	# nonNA : length of non-NA elements in a vector
	# findOptParam : Find optimal param value across seeds
	# freqRatio : frequency ratio
	# sumTo : scale non-negative reals so they sum to ...
	# fracUnique : fraction of unique values
	# addLineNums : adds ID columnas the first column of the DF
	# naBy : replacing NA by something
	# nvToDf : create a df from a named vector
	# rownamesAsColumn: adds rownames as the first column ...
	# cmu : current memory usage
	# mmu : maximum memory usage
	# jaccard_m : multiset jaccard for two sets A and B
	# count : count the number of occurances of pattern in a vector
	# colwise_percent : scale columns of non-negative matrix so that each column sums to 100
	# rowwise_percent : scale rows of non-negative matrix so that each row sums to 100
	# to_percent : convert a numeric vector to a vector of percentages

	# pChange ----
	# percentage change from ...
	#
	# input: integer/numeric vector
	# output: numeric vector
	pChange <- function(vec,from=vec[1]){
	(vec - from) %>%
	divide_by(from) %>%
	multiply_by(100)
	}

	# reverse ----
	# reverse a vector
	#
	# input: vector
	# output: vector
	reverse <- function(vec){
	vec[seq(from = length(vec)
	,to = 1
	,by = -1)]
	}

	# frequency vector ----
	#
	# input: vector
	# output: named integer vector
	fv <- function(vec){
	if (vec %>% length == 0)
	return(0)
	t <- vec %>% table
	f <- as.integer(t)
	names(f) <- names(t)
	f[vec %>% unique %>% as.character]
	}

	# rotate ----
	# rotate a vector forward or backward
	#
	# input: vector
	# output: vector
	rotate <- function(vec,by=1){
	ell <- length(vec)
	if(by < 0)
	by <- ell - mod(abs(by),ell)
	vec[seq(from = 0,to = ell - 1)
	%>% add(by)
	%>% mod(ell)
	%>% add(1)]
	}

	# n_nonNA ----
	# length of non-NA elements in a vector
	#
	# input: vector
	# output: vector

	n_nonNA <- function(vec){
	sum(is.na(vec) == F)
	}

	# nonNA ----
	# get non-NA elements of a vector
	# The resulting NULL is to be later checked with 'is.logical'
	#
	# input: vector
	# output: vector

	nonNA <- function(vec){
	index <- !is.na(vec)
	if (sum(index) == 0)
	return(NULL)
	vec[index]
	}

	# findoptparam ----
	# Find the most optimal parameter value across seeds

	# Motivation
	# Different training/testing breakups(seeds) give
	# different MSE/accuracy's for differing parameters.
	# We intend to find the parameter value which
	# min/maximizes the MSE/accuracy.

	# Idea
	# ranking: rank params for each seed, aggregate over seeds.
	# median: consider the median values of the params over seeds
	# trimmedMean: trim is passed 'as is' to the mean

	# Input format
	# matrix with different seeds breakups are rows
	# different parameter are columns
	# options for optimizeTo: 'min','max'
	# option for strategy: 'rankingSum','median','trimmedMean'
	# verbose is set to False by default

	# Output format
	# if verbose is False
	# column number indicating the param
	# if verbose is True
	# param: column number indicating the param.
	# sortedParams: sorted scoreSums of params with named params
	# pc: percentage change of params wrt to the chosen param
	# sd.param: sd of each param across seeds
	# sd.seeds: sd of each seed across params

	findOptParam <- function(im # input matrix
	, optimizeTo = 'min'
	, strategy = 'rankingSum'
	, verbose = F
	, trim = 10)
	{
	if (strategy == 'rankingSum')
	scoreSums <- apply(im,1,rank) %>% t %>% colSums
	else if (strategy == 'median')
	scoreSums <- apply(im,1,median)
	else if (strategy == 'trimmedMean')
	scoreSums <- apply(im,1,function(x){mean(x,trim = trim)})

	# handling output
	if (verbose == F) {
	# return param name
	if (optimizeTo == 'min') {
	which.min(scoreSums) %>% return}
	else
	which.max(scoreSums) %>% return}
	else {
	# return a detailed list
	if (optimizeTo == 'min') {
	sortOrder <- order(scoreSums,decreasing = F)
	sortedParams <- scoreSums[sortOrder]
	list(
	colNum = which.min(scoreSums)
	,sortedParams = rbind(sortOrder,sortedParams)
	,pc = pChange(sortedParams,from = min(sortedParams)) %>% abs
	,sd.param = apply(im,2,sd)
	,sd.seeds = apply(im,1,sd))
	}
	else {
	sortOrder <- order(scoreSums,decreasing = T)
	sortedParams <- scoreSums[sortOrder]
	list(
	colNum = which.max(scoreSums)
	,sortedParams = rbind(sortOrder,sortedParams)
	,pc = pChange(sortedParams,from = max(sortedParams)) %>% abs
	,sd.param = apply(im,2,sd)
	,sd.seeds = apply(im,1,sd))
	}
	} # end of returning detailed list
	}

	# freqRatio -----
	# freqRatio : frequency ratio
	# see how it handles empty vector
	#
	# input: vector
	# output: named vector
	freqRatio <- function(vec){
	if (vec %>% length == 0)
	return(0)
	t <- table(vec)
	freq <- t %>% as.integer
	names(freq) <- names(t)
	ratio <- freq/sum(freq)
	ratio[vec %>% unique %>% as.character]
	}

	# sumTo ----
	# sumTo: scale(and not center) non-negative reals
	# so they add up to something
	#
	# input: numeric vector
	# output: numeric vector
	sumTo <- function(vec,to = 1)
	{
	if (any(vec < 0))
	stop('sumTo needs non-negative reals as a numeric vector')
	if (all(vec == 0))
	return(vec)
	(vec/sum(vec)) * to
	}

	# fracUnique ----
	# fracUnique: fraction of unique values
	#
	# input: vector
	# output: integer/numeric
	fracUnique <- function(vec){
	ifelse(length(vec) == 0
	,0
	,n_distinct(vec)/length(vec))
	}

	# addLineNums ----
	# adds 'id' column as the first column of the DF
	addLineNums <- function(df){
	if ('id' %in% names(df))
	stop("ERROR in 'addLineNums': There already exists a column 'id'.")
	df %>%
	mutate(id = seq_len(df %>% nrow)) %>%
	select(id,everything())
	}

	# naBy ---------------------------------------------
	# replacing NA by something, defaults to 0
	naBy <- function(df,by = 0){
	df[which(is.na(df),arr.ind = T)] <- by
	df
	}

	# nvToDf ----
	# create a df from a named vector
	nvToDf <- function(nv,stringsAsFactors = F){
	if (length(nv) == 0)
	return(NULL)
	if (nv %>% names %>% is.null)
	names <- as.character(1:length(nv))
	else
	names <- names(nv)
	data.frame(rownames = names
	,value = nv
	,row.names = NULL
	,stringsAsFactors = stringsAsFactors)
	}

	# rownamesAsColumn ----
	# adds rownames as the first column to the df and removes rownames
	rownamesAsColumn <- function(df,asFactor = F){
	if (asFactor == F)
	df %>%
	mutate(rownames = rownames(df)) %>%
	select(rownames,everything())
	else
	df %>%
	mutate(rownames = rownames(df) %>% as.factor) %>%
	select(rownames,everything())
	}

	# cmu - current memory usage ----
	# utility to do garbage collection and check current memory usage
	# if unit is not mb, memory is reported in gb
	cmu <- function(unit = 'mb'){
	gcO <- gc(verbose = F)
	gcinfo(verbose = F)
	ifelse(unit == 'mb'
	,gcO[,2] %>% sum
	,gcO[,2] %>% sum %>% divide_by(2^10)
	)
	}

	# mmu - maximum memory usage ----
	# utility to do garbage collection and check maximum memory usage
	# run gc(reset = T) before a big computation
	# if unit is not mb, memory is reported in gb
	mmu <- function(unit = 'mb'){
	gcO <- gc(verbose = F)
	gcinfo(verbose = F)
	ifelse(unit == 'mb'
	,gcO[,6] %>% sum
	,gcO[,6] %>% sum %>% divide_by(2^10)
	)
	}

	# multiset jaccard for two sets A and B ----
	# Let n(a,A) be the number of occurences of element a in multiset A.
	# definition: numerator/denominator where
	# numerator = sum( min( n(a,A) , n(a,B) ) for every a in union(A,B))
	# denominator = sum( max( n(a,A) , n(a,B) ) for every a in union(A,B))

	jaccard_m = function(vec1, vec2){

	all = union(vec1, vec2)
	counts = lapply( all
	, function(x){
	c(count(vec1, x), count(vec2, x))
	})
	num = vapply(counts, min, integer(1))
	den = vapply(counts, max, integer(1))
	return( sum(num)/sum(den) )
	}

	# count the number of occurances of pattern in a vector ----
	count = function(vec, pattern){
	return( sum(grepl(pattern, vec, perl = TRUE)) )
	}


	# scale columns of non-negative matrix so that each column sums to 100
	# a shorter version using `scale` may be written

	colwise_percent <- function(mat){

	# check whether we have a non-negative matrix
	stopifnot(is.matrix(mat) &&
	typeof(mat) %in% c("integer", "double") &&
	all(mat >= 0)
	)
	# return a percentage matrix
	res_mat <- apply(mat
	, 2
	, function(x){
	s <- sum(x, na.rm = TRUE)
	if(s == 0){
	res <- rep(0, length(x))
	} else {
	res <- (x/s) * 100
	}
	return(res)
	}
	)

	return(res_mat)
	}

	# scale rows of non-negative matrix so that each row sums to 100
	# a shorter version using `scale` may be written

	rowwise_percent <- function(mat){

	# check whether we have a non-negative matrix
	stopifnot(is.matrix(mat) &&
	typeof(mat) %in% c("integer", "double") &&
	all(mat >= 0)
	)
	# return a percentage matrix
	res_mat <- t(apply(mat
	, 1
	, function(x){
	s <- sum(x, na.rm = TRUE)
	if(s == 0){
	res <- rep(0, length(x))
	} else {
	res <- (x/s) * 100
	}
	return(res)
	}
	)
	)
	colnames(res_mat) <- colnames(mat)
	return(res_mat)
	}

	# convert a numeric vector to a vector of percentages
	to_percent <- function(vec){
	stopifnot(is.numeric(vec))
	stopifnot(all(vec >= 0))
	stopifnot(!anyNA(vec))

	s <- sum(vec)
	if(s != 0){
	return( (vec/sum(vec)) * 100 )
	} else {
	return(vec)
	}
	}

	# intapprox
	#
	# Produce an integer vector approximation of a non-decreasing non-negative
	# numeric vector such that it sums to an positive integer

	intapprox <- function(vec, n){

	stopifnot(is.numeric(vec) \|\| is.integer(vec))
	stopifnot(identical(unname(rank(vec, ties.method = "last")), length(vec):1))
	stopifnot(all(vec >= 0))
	stopifnot(is.numeric(n) && n > 0&& round(n) == n && length(n) == 1)

	ratios <- (vec/sum(vec)) * n
	intpart <- floor(ratios)
	fracpart <- ratios - intpart
	leftout <- n - sum(intpart)
	if(leftout != 0){
	addat <- order(fracpart, length(vec):1, decreasing = TRUE)[1:leftout]
	intpart[addat] <- intpart[addat] + 1
	}
	names(intpart) <- names(vec)
	return(intpart)
	}

	#' @title csv_chunkIndex
	#'
	#' @description add repeated column to a CSV
	#'
	#' @param file Path to input CSV file
	#' @param size Length of each index
	#' @param sep Separator
	#' @param colname Column name for the index. It must be NULL if and only if CSV
	#' file lacks a header
	#' @param outfile Path to output file. If NULL, 'outfile.csv' will be created at
	#' working directory.

	csv_chunkIndex <- function(file
	, size = 1e4
	, sep = ","
	, colname = NULL
	, outfile = NULL){

	# assertions ----
	if(.Platform$OS.type != "unix"){
	stop("Works only on unix based systems.")
	}

	if(!file_test("-f", file)){
	stop("'file' does not exist.")
	}

	if(size <= 1 \|\| as.integer(size) != size){
	stop("'size' must be an integer greater than 1.")
	}

	if(!(is.character(sep) && length(sep) == 1)){
	stop("'sep' must be a string.")
	}

	if(!is.null(colname)){
	if(!(is.character(colname) && length(colname) == 1)){
	stop("'colname' must be a string.")
	}
	}

	if(!is.null(outfile)){
	if(file_test("-f", outfile)){
	stop("'outfile' already exists.")
	}
	} else{
	file.create("outfile.csv")
	outfile <- normalizePath("outfile.csv")
	}

	file.create("outfile_temp.csv")
	on.exit(unlink("outfile_temp.csv"))

	# find number of lines of the file ----
	nol <- system(paste0("wc -l ", normalizePath(file)), intern = TRUE)
	nol <- as.integer(strsplit(nol, " ")[[1]][1])
	if(!is.null(colname)){
	nol <- nol -1
	}

	# find the number of things ----
	num <- floor(nol/size)
	rem <- nol %% size

	# create one time repeated file ----
	if(num != 0){
	write(1:num
	, file = "outfile_temp.csv"
	, ncolumns = 1)
	}

	# create repeated file ----
	repCommand <- paste0("perl -ne 'for$i(0.."
	, as.character(size - 1)
	, "){print}' "
	, "outfile_temp.csv"
	, " > "
	, outfile
	)
	system(repCommand, wait = TRUE)

	# write reminder lines ----
	if(rem != 0){
	write(rep(num + 1, rem)
	, file = outfile
	, ncolumns = 1
	, append = TRUE
	)
	}

	# write colname ----
	if(!is.null(colname)){
	system(paste0("sed -i '1i ", colname, "' ", outfile), wait = TRUE)
	}

	# rbind to the CSV file ----
	system(paste0("cp ", outfile, " outfile_temp.csv"))
	unlink(outfile)
	system(paste0("paste -d "
	, sep
	, " "
	, "outfile_temp.csv"
	, " "
	, normalizePath(file)
	, " > "
	, outfile
	)
	, wait = TRUE
	)

	return(outfile)
	}

	#' @examples
	#' write.csv(mtcars, "mtcars.csv", sep = ",", row.names = FALSE)
	#' csv_chunkIndex("mtcars.csv"
	#' , size = 5
	#' , colname = "index"
	#' , outfile = "mtcars2.csv"
	#' )
	#' read.csv("mtcars2.csv")