skranz · May 17, 2014 09:10
diff --git a/mutate_if b/mutate_if

 # Tools to make it run
 deparse_all <- function(x) {
  deparse2 <- function(x) paste(deparse(x, width.cutoff = 500L), collapse = "")
  vapply(x, deparse2, FUN.VALUE = character(1))
 }

 dt_env <- function(dt, env) {
  env <- new.env(parent = env, size = 2L)
  env$dt <- dt
  env$vars <- deparse_all(groups(dt))

  env
 }


 # sugessted change of in manip.r

 #' Data manipulation functions.
 #'
 #' These five functions form the backbone of dplyr. They are all S3 generic
 #' functions with methods for each individual data type. All functions work
 #' exactly the same way: the first argument is the tbl, and the
 #' subsequence arguments are interpreted in the context of that tbl.
 #'
 #' @section Manipulation functions:
 #'
 #' The five key data manipulation functions are:
 #'
 #' \itemize{
 #'   \item filter: return only a subset of the rows. If multiple conditions are
 #'     supplied they are combined with \code{&}.
 #'   \item select: return only a subset of the columns. If multiple columns are
 #'     supplied they are all used.
 #'   \item arrange: reorder the rows. Multiple inputs are ordered from left-to-
 #'    right.
 #'   \item mutate: add new columns or replace existing columns. Multiple inputs create multiple columns.
 #'   \item mutate_if: replace selected rows of existing columns.
 #'   \item summarise: reduce each group to a single row. Multiple inputs create
 #'     multiple output summaries.
 #' }
 #'
 #' These are all made significantly more useful when applied by group,
 #' as with \code{\link{group_by}}
 #'
 #' @section Tbls:
 #'
 #' dplyr comes with three built-in tbls.  Read the help for the
 #' manip methods of that class to get more details:
 #'
 #' \itemize{
 #'   \item data.frame: \link{manip_df}
 #'   \item data.table: \link{manip_dt}
 #'   \item SQLite: \code{\link{src_sqlite}}
 #'   \item PostgreSQL: \code{\link{src_postgres}}
 #'   \item MySQL: \code{\link{src_mysql}}
 #' }
 #'
 #' @section Output:
 #'
 #' Generally, manipulation functions will return an output object of the
 #' same type as their input. The exceptions are:
 #'
 #' \itemize{
 #'    \item \code{summarise} will return an ungrouped source
 #'    \item remote sources (like databases) will typically return a local
 #'      source from at least \code{summarise} and \code{mutate}
 #' }
 #'
 #' @section Row names:
 #'
 #' dplyr methods do not preserve row names. If have been using row names
 #' to store important information, please make them explicit variables.
 #'
 #' @name manip
 #' @param .data a tbl
 #' @param ... variables interpreted in the context of that data frame.
 #' @examples
 #' filter(mtcars, cyl == 8)
 #' select(mtcars, mpg, cyl, hp:vs)
 #' arrange(mtcars, cyl, disp)
 #' mutate(mtcars, displ_l = disp / 61.0237)
 #' mutate_if(mtcars,cyl==8, displ_l = disp / 61.0237)
 #' summarise(mtcars, mean(disp))
 #' summarise(group_by(mtcars, cyl), mean(disp))
 NULL


 # code for manip.r

 #' @rdname manip
 #' @export
 mutate_if = function (.data,.if,...) {
  UseMethod("mutate_if")
 }

 # for tbl-data.frame.R

 #' @rdname manip_df
 #' @export
 mutate_if.data.frame =function (.data,.if,...) 
 {
  dt = as.data.table(.data)
  .if.quoted = substitute(.if)
  as.data.frame(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()))
 }

 # for manip-df.r

 #' @rdname manip_df
 #' @export
 mutate_if.tbl_df    <- function (.data,.if,...) {
  dt = as.data.table(.data)
  .if.quoted = substitute(.if)
  tbl_df(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()))
 }

 #' @export
 mutate.tbl_dt <- function(.data,.if, ...) {
  .if.quoted = substitute(.if)
  tbl_dt(
    mutate_if.data.table(.data=.data,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame())
  )
 }

 # for manip-dt.r

 #' @rdname manip_dt
 #' @export
 mutate_if.data.table <- function (.data,.if, ..., inplace = FALSE,.if.quoted=NULL, .parent.env=parent.frame()) 
 {
  if (is.null(.if.quoted))
    .if.quoted = substitute(.if)

  if (!inplace) 
    .data <- copy(.data)
  env <- new.env(parent = .parent.env, size = 1L)
  env$data <- .data
  cols <- named_dots(...)

  for (i in seq_along(cols)) {
    call <- substitute(data[.if.quoted, `:=`(lhs, rhs)], list(lhs = as.name(names(cols)[[i]]), rhs = cols[[i]], .if.quoted =.if.quoted))
    eval(call, env)
  }
  .data
 }



 # for manip-grouped-dt.r

 #' @rdname manip_grouped_dt
 #' @export
 mutate_if.grouped_dt <- function(.data,.if, ..., inplace = FALSE, .if.quoted=NULL) {
  data <- .data
  if (is.null(.if.quoted))
    .if.quoted = substitute(.if)
  if (!inplace) data <- copy(data)

  env <- dt_env(data, parent.frame())
  cols <- named_dots(...)
  # For each new variable, generate a call of the form df[, new := expr]
  for(col in names(cols)) {
    call <- substitute(dt[.if.quoted, lhs := rhs, by = vars],
      list(lhs = as.name(col), rhs = cols[[col]], .if.quoted=.if.quoted))
    eval(call, env)
  }

  grouped_dt(
    data = data,
    vars = groups(.data)
  )
 }

 #' @rdname manip_grouped_df
 #' @export
 mutate_if.grouped_df <- function(.data,.if, ...) {
  # This function is currently extremely unelegant and inefficient
  # Problem: when transforming to data.table row order will be changed
  # by group_by operation at least in dplyr 0.1.3
  # So I manually restore the original row order
  
  if (NROW(.data)==0)
    return(.data)
  .if.quoted = substitute(.if)
  vars = groups(.data)
  dt = as.data.table(.data)
  class(dt) = c("data.table","data.frame")
  mutate(dt, INDEX.ROW__ = 1:NROW(.data), inplace=TRUE)
  gdt = grouped_dt(dt, vars=vars)
  gdt = mutate_if.grouped_dt(gdt,.if.quoted=.if.quoted,..., inplace=TRUE) 
  data = dplyr:::grouped_df(data=as.data.frame(gdt), vars=vars)
  # restore original order
  data = select(arrange(data, INDEX.ROW__), -INDEX.ROW__)
  data

 }



 examples = function() {
  library(microbenchmark)
  #library(modify)
  library(dplyr)
  library(pryr)
  library(data.table)
  
  # Benckmark compared to directly using data.table or dplyr 
  set.seed(123456)
  n = 1e1
  df = data.frame(a= sample(1:3,n,replace=TRUE),
                   b= sample(1:100,n,replace=TRUE),
                   x=rnorm(n))
  dt = as.data.table(df)
  mutate_if(df,a==3,x=100)
  mutate_if(tbl_df(df),a==1,x=200)
  mutate_if(as.tbl(df),a==1,x=300,b=400)
  mutate_if(dt,a==1 | a==2,x=400)

  mutate_if(group_by(dt,a),a==1 | a==2,x=mean(b))

  # Quite inefficient implementation
  mutate_if(group_by(df,a),a==1 | a==2,x=mean(b))
  
  
  # Small benchmark
  n = 1e6
  df = data.frame(a= sample(1:3,n,replace=TRUE),
                   b= sample(1:100,n,replace=TRUE),
                   x=rnorm(n))
  
  microbenchmark(times = 5L,
    mutate(df, x=ifelse(a==2,x+100,x)),
    mutate_if(df, a==2, x=x+100)
  )
 #Unit: milliseconds
 #                                       expr      min       lq    median        uq       max neval
 # mutate(df, x = ifelse(a == 2, x + 100, x)) 749.2954 754.4179 815.06681 820.95872 860.79326     5
 #         mutate_if(df, a == 2, x = x + 100)  72.2886  75.4189  77.47787  83.64689  86.33666     5
 }

	# Tools to make it run
	deparse_all <- function(x) {
	deparse2 <- function(x) paste(deparse(x, width.cutoff = 500L), collapse = "")
	vapply(x, deparse2, FUN.VALUE = character(1))
	}

	dt_env <- function(dt, env) {
	env <- new.env(parent = env, size = 2L)
	env$dt <- dt
	env$vars <- deparse_all(groups(dt))

	env
	}


	# sugessted change of in manip.r

	#' Data manipulation functions.
	#'
	#' These five functions form the backbone of dplyr. They are all S3 generic
	#' functions with methods for each individual data type. All functions work
	#' exactly the same way: the first argument is the tbl, and the
	#' subsequence arguments are interpreted in the context of that tbl.
	#'
	#' @section Manipulation functions:
	#'
	#' The five key data manipulation functions are:
	#'
	#' \itemize{
	#' \item filter: return only a subset of the rows. If multiple conditions are
	#' supplied they are combined with \code{&}.
	#' \item select: return only a subset of the columns. If multiple columns are
	#' supplied they are all used.
	#' \item arrange: reorder the rows. Multiple inputs are ordered from left-to-
	#' right.
	#' \item mutate: add new columns or replace existing columns. Multiple inputs create multiple columns.
	#' \item mutate_if: replace selected rows of existing columns.
	#' \item summarise: reduce each group to a single row. Multiple inputs create
	#' multiple output summaries.
	#' }
	#'
	#' These are all made significantly more useful when applied by group,
	#' as with \code{\link{group_by}}
	#'
	#' @section Tbls:
	#'
	#' dplyr comes with three built-in tbls. Read the help for the
	#' manip methods of that class to get more details:
	#'
	#' \itemize{
	#' \item data.frame: \link{manip_df}
	#' \item data.table: \link{manip_dt}
	#' \item SQLite: \code{\link{src_sqlite}}
	#' \item PostgreSQL: \code{\link{src_postgres}}
	#' \item MySQL: \code{\link{src_mysql}}
	#' }
	#'
	#' @section Output:
	#'
	#' Generally, manipulation functions will return an output object of the
	#' same type as their input. The exceptions are:
	#'
	#' \itemize{
	#' \item \code{summarise} will return an ungrouped source
	#' \item remote sources (like databases) will typically return a local
	#' source from at least \code{summarise} and \code{mutate}
	#' }
	#'
	#' @section Row names:
	#'
	#' dplyr methods do not preserve row names. If have been using row names
	#' to store important information, please make them explicit variables.
	#'
	#' @name manip
	#' @param .data a tbl
	#' @param ... variables interpreted in the context of that data frame.
	#' @examples
	#' filter(mtcars, cyl == 8)
	#' select(mtcars, mpg, cyl, hp:vs)
	#' arrange(mtcars, cyl, disp)
	#' mutate(mtcars, displ_l = disp / 61.0237)
	#' mutate_if(mtcars,cyl==8, displ_l = disp / 61.0237)
	#' summarise(mtcars, mean(disp))
	#' summarise(group_by(mtcars, cyl), mean(disp))
	NULL


	# code for manip.r

	#' @rdname manip
	#' @export
	mutate_if = function (.data,.if,...) {
	UseMethod("mutate_if")
	}

	# for tbl-data.frame.R

	#' @rdname manip_df
	#' @export
	mutate_if.data.frame =function (.data,.if,...)
	{
	dt = as.data.table(.data)
	.if.quoted = substitute(.if)
	as.data.frame(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()))
	}

	# for manip-df.r

	#' @rdname manip_df
	#' @export
	mutate_if.tbl_df <- function (.data,.if,...) {
	dt = as.data.table(.data)
	.if.quoted = substitute(.if)
	tbl_df(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()))
	}

	#' @export
	mutate.tbl_dt <- function(.data,.if, ...) {
	.if.quoted = substitute(.if)
	tbl_dt(
	mutate_if.data.table(.data=.data,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame())
	)
	}

	# for manip-dt.r

	#' @rdname manip_dt
	#' @export
	mutate_if.data.table <- function (.data,.if, ..., inplace = FALSE,.if.quoted=NULL, .parent.env=parent.frame())
	{
	if (is.null(.if.quoted))
	.if.quoted = substitute(.if)

	if (!inplace)
	.data <- copy(.data)
	env <- new.env(parent = .parent.env, size = 1L)
	env$data <- .data
	cols <- named_dots(...)

	for (i in seq_along(cols)) {
	call <- substitute(data[.if.quoted, `:=`(lhs, rhs)], list(lhs = as.name(names(cols)[[i]]), rhs = cols[[i]], .if.quoted =.if.quoted))
	eval(call, env)
	}
	.data
	}



	# for manip-grouped-dt.r

	#' @rdname manip_grouped_dt
	#' @export
	mutate_if.grouped_dt <- function(.data,.if, ..., inplace = FALSE, .if.quoted=NULL) {
	data <- .data
	if (is.null(.if.quoted))
	.if.quoted = substitute(.if)
	if (!inplace) data <- copy(data)

	env <- dt_env(data, parent.frame())
	cols <- named_dots(...)
	# For each new variable, generate a call of the form df[, new := expr]
	for(col in names(cols)) {
	call <- substitute(dt[.if.quoted, lhs := rhs, by = vars],
	list(lhs = as.name(col), rhs = cols[[col]], .if.quoted=.if.quoted))
	eval(call, env)
	}

	grouped_dt(
	data = data,
	vars = groups(.data)
	)
	}

	#' @rdname manip_grouped_df
	#' @export
	mutate_if.grouped_df <- function(.data,.if, ...) {
	# This function is currently extremely unelegant and inefficient
	# Problem: when transforming to data.table row order will be changed
	# by group_by operation at least in dplyr 0.1.3
	# So I manually restore the original row order

	if (NROW(.data)==0)
	return(.data)
	.if.quoted = substitute(.if)
	vars = groups(.data)
	dt = as.data.table(.data)
	class(dt) = c("data.table","data.frame")
	mutate(dt, INDEX.ROW__ = 1:NROW(.data), inplace=TRUE)
	gdt = grouped_dt(dt, vars=vars)
	gdt = mutate_if.grouped_dt(gdt,.if.quoted=.if.quoted,..., inplace=TRUE)
	data = dplyr:::grouped_df(data=as.data.frame(gdt), vars=vars)
	# restore original order
	data = select(arrange(data, INDEX.ROW__), -INDEX.ROW__)
	data

	}



	examples = function() {
	library(microbenchmark)
	#library(modify)
	library(dplyr)
	library(pryr)
	library(data.table)

	# Benckmark compared to directly using data.table or dplyr
	set.seed(123456)
	n = 1e1
	df = data.frame(a= sample(1:3,n,replace=TRUE),
	b= sample(1:100,n,replace=TRUE),
	x=rnorm(n))
	dt = as.data.table(df)
	mutate_if(df,a==3,x=100)
	mutate_if(tbl_df(df),a==1,x=200)
	mutate_if(as.tbl(df),a==1,x=300,b=400)
	mutate_if(dt,a==1 \| a==2,x=400)

	mutate_if(group_by(dt,a),a==1 \| a==2,x=mean(b))

	# Quite inefficient implementation
	mutate_if(group_by(df,a),a==1 \| a==2,x=mean(b))


	# Small benchmark
	n = 1e6
	df = data.frame(a= sample(1:3,n,replace=TRUE),
	b= sample(1:100,n,replace=TRUE),
	x=rnorm(n))

	microbenchmark(times = 5L,
	mutate(df, x=ifelse(a==2,x+100,x)),
	mutate_if(df, a==2, x=x+100)
	)
	#Unit: milliseconds
	# expr min lq median uq max neval
	# mutate(df, x = ifelse(a == 2, x + 100, x)) 749.2954 754.4179 815.06681 820.95872 860.79326 5
	# mutate_if(df, a == 2, x = x + 100) 72.2886 75.4189 77.47787 83.64689 86.33666 5
	}