Created
May 17, 2014 09:10
-
-
Save skranz/b2343e7178a657328f49 to your computer and use it in GitHub Desktop.
mutate_if
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tools to make it run | |
deparse_all <- function(x) { | |
deparse2 <- function(x) paste(deparse(x, width.cutoff = 500L), collapse = "") | |
vapply(x, deparse2, FUN.VALUE = character(1)) | |
} | |
dt_env <- function(dt, env) { | |
env <- new.env(parent = env, size = 2L) | |
env$dt <- dt | |
env$vars <- deparse_all(groups(dt)) | |
env | |
} | |
# sugessted change of in manip.r | |
#' Data manipulation functions. | |
#' | |
#' These five functions form the backbone of dplyr. They are all S3 generic | |
#' functions with methods for each individual data type. All functions work | |
#' exactly the same way: the first argument is the tbl, and the | |
#' subsequence arguments are interpreted in the context of that tbl. | |
#' | |
#' @section Manipulation functions: | |
#' | |
#' The five key data manipulation functions are: | |
#' | |
#' \itemize{ | |
#' \item filter: return only a subset of the rows. If multiple conditions are | |
#' supplied they are combined with \code{&}. | |
#' \item select: return only a subset of the columns. If multiple columns are | |
#' supplied they are all used. | |
#' \item arrange: reorder the rows. Multiple inputs are ordered from left-to- | |
#' right. | |
#' \item mutate: add new columns or replace existing columns. Multiple inputs create multiple columns. | |
#' \item mutate_if: replace selected rows of existing columns. | |
#' \item summarise: reduce each group to a single row. Multiple inputs create | |
#' multiple output summaries. | |
#' } | |
#' | |
#' These are all made significantly more useful when applied by group, | |
#' as with \code{\link{group_by}} | |
#' | |
#' @section Tbls: | |
#' | |
#' dplyr comes with three built-in tbls. Read the help for the | |
#' manip methods of that class to get more details: | |
#' | |
#' \itemize{ | |
#' \item data.frame: \link{manip_df} | |
#' \item data.table: \link{manip_dt} | |
#' \item SQLite: \code{\link{src_sqlite}} | |
#' \item PostgreSQL: \code{\link{src_postgres}} | |
#' \item MySQL: \code{\link{src_mysql}} | |
#' } | |
#' | |
#' @section Output: | |
#' | |
#' Generally, manipulation functions will return an output object of the | |
#' same type as their input. The exceptions are: | |
#' | |
#' \itemize{ | |
#' \item \code{summarise} will return an ungrouped source | |
#' \item remote sources (like databases) will typically return a local | |
#' source from at least \code{summarise} and \code{mutate} | |
#' } | |
#' | |
#' @section Row names: | |
#' | |
#' dplyr methods do not preserve row names. If have been using row names | |
#' to store important information, please make them explicit variables. | |
#' | |
#' @name manip | |
#' @param .data a tbl | |
#' @param ... variables interpreted in the context of that data frame. | |
#' @examples | |
#' filter(mtcars, cyl == 8) | |
#' select(mtcars, mpg, cyl, hp:vs) | |
#' arrange(mtcars, cyl, disp) | |
#' mutate(mtcars, displ_l = disp / 61.0237) | |
#' mutate_if(mtcars,cyl==8, displ_l = disp / 61.0237) | |
#' summarise(mtcars, mean(disp)) | |
#' summarise(group_by(mtcars, cyl), mean(disp)) | |
NULL | |
# code for manip.r | |
#' @rdname manip | |
#' @export | |
mutate_if = function (.data,.if,...) { | |
UseMethod("mutate_if") | |
} | |
# for tbl-data.frame.R | |
#' @rdname manip_df | |
#' @export | |
mutate_if.data.frame =function (.data,.if,...) | |
{ | |
dt = as.data.table(.data) | |
.if.quoted = substitute(.if) | |
as.data.frame(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame())) | |
} | |
# for manip-df.r | |
#' @rdname manip_df | |
#' @export | |
mutate_if.tbl_df <- function (.data,.if,...) { | |
dt = as.data.table(.data) | |
.if.quoted = substitute(.if) | |
tbl_df(mutate_if.data.table(.data=dt,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame())) | |
} | |
#' @export | |
mutate.tbl_dt <- function(.data,.if, ...) { | |
.if.quoted = substitute(.if) | |
tbl_dt( | |
mutate_if.data.table(.data=.data,.if.quoted=.if.quoted,...,inplace=TRUE, .parent.env = parent.frame()) | |
) | |
} | |
# for manip-dt.r | |
#' @rdname manip_dt | |
#' @export | |
mutate_if.data.table <- function (.data,.if, ..., inplace = FALSE,.if.quoted=NULL, .parent.env=parent.frame()) | |
{ | |
if (is.null(.if.quoted)) | |
.if.quoted = substitute(.if) | |
if (!inplace) | |
.data <- copy(.data) | |
env <- new.env(parent = .parent.env, size = 1L) | |
env$data <- .data | |
cols <- named_dots(...) | |
for (i in seq_along(cols)) { | |
call <- substitute(data[.if.quoted, `:=`(lhs, rhs)], list(lhs = as.name(names(cols)[[i]]), rhs = cols[[i]], .if.quoted =.if.quoted)) | |
eval(call, env) | |
} | |
.data | |
} | |
# for manip-grouped-dt.r | |
#' @rdname manip_grouped_dt | |
#' @export | |
mutate_if.grouped_dt <- function(.data,.if, ..., inplace = FALSE, .if.quoted=NULL) { | |
data <- .data | |
if (is.null(.if.quoted)) | |
.if.quoted = substitute(.if) | |
if (!inplace) data <- copy(data) | |
env <- dt_env(data, parent.frame()) | |
cols <- named_dots(...) | |
# For each new variable, generate a call of the form df[, new := expr] | |
for(col in names(cols)) { | |
call <- substitute(dt[.if.quoted, lhs := rhs, by = vars], | |
list(lhs = as.name(col), rhs = cols[[col]], .if.quoted=.if.quoted)) | |
eval(call, env) | |
} | |
grouped_dt( | |
data = data, | |
vars = groups(.data) | |
) | |
} | |
#' @rdname manip_grouped_df | |
#' @export | |
mutate_if.grouped_df <- function(.data,.if, ...) { | |
# This function is currently extremely unelegant and inefficient | |
# Problem: when transforming to data.table row order will be changed | |
# by group_by operation at least in dplyr 0.1.3 | |
# So I manually restore the original row order | |
if (NROW(.data)==0) | |
return(.data) | |
.if.quoted = substitute(.if) | |
vars = groups(.data) | |
dt = as.data.table(.data) | |
class(dt) = c("data.table","data.frame") | |
mutate(dt, INDEX.ROW__ = 1:NROW(.data), inplace=TRUE) | |
gdt = grouped_dt(dt, vars=vars) | |
gdt = mutate_if.grouped_dt(gdt,.if.quoted=.if.quoted,..., inplace=TRUE) | |
data = dplyr:::grouped_df(data=as.data.frame(gdt), vars=vars) | |
# restore original order | |
data = select(arrange(data, INDEX.ROW__), -INDEX.ROW__) | |
data | |
} | |
examples = function() { | |
library(microbenchmark) | |
#library(modify) | |
library(dplyr) | |
library(pryr) | |
library(data.table) | |
# Benckmark compared to directly using data.table or dplyr | |
set.seed(123456) | |
n = 1e1 | |
df = data.frame(a= sample(1:3,n,replace=TRUE), | |
b= sample(1:100,n,replace=TRUE), | |
x=rnorm(n)) | |
dt = as.data.table(df) | |
mutate_if(df,a==3,x=100) | |
mutate_if(tbl_df(df),a==1,x=200) | |
mutate_if(as.tbl(df),a==1,x=300,b=400) | |
mutate_if(dt,a==1 | a==2,x=400) | |
mutate_if(group_by(dt,a),a==1 | a==2,x=mean(b)) | |
# Quite inefficient implementation | |
mutate_if(group_by(df,a),a==1 | a==2,x=mean(b)) | |
# Small benchmark | |
n = 1e6 | |
df = data.frame(a= sample(1:3,n,replace=TRUE), | |
b= sample(1:100,n,replace=TRUE), | |
x=rnorm(n)) | |
microbenchmark(times = 5L, | |
mutate(df, x=ifelse(a==2,x+100,x)), | |
mutate_if(df, a==2, x=x+100) | |
) | |
#Unit: milliseconds | |
# expr min lq median uq max neval | |
# mutate(df, x = ifelse(a == 2, x + 100, x)) 749.2954 754.4179 815.06681 820.95872 860.79326 5 | |
# mutate_if(df, a == 2, x = x + 100) 72.2886 75.4189 77.47787 83.64689 86.33666 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment