Created
September 21, 2016 21:27
-
-
Save halpo/7809c07664efba522ce31c8d37d7169f to your computer and use it in GitHub Desktop.
spread_each and margins functions which compliment dplyr and tidyr packages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
margins <- | |
function( grouped #< [grouped_df] A data frame with groups defined. | |
, ... #< passed to `FUN` | |
, FUN=dplyr::summarize #< Summary function | |
, all.name = getOption("margins::all.name", "(All)") | |
){ | |
"Add margins to summarization" | |
g <- groups(grouped) | |
com <- lapply(seq(0, length(g)), combn, x=g, simplify=FALSE) | |
com <- Reduce(c, com) | |
com <- com[order(desc(sapply(com, length)))] | |
l <- | |
lapply(com, function(g){ | |
g2 <- if(length(g)) group_by_(ungroup(grouped), .dots=g) else ungroup(grouped) | |
FUN(g2, ...) | |
}) %>% | |
bind_rows | |
for(v in as.character(g)){ | |
if(v %in% names(l)) | |
l[[v]] <- xifnotNA(as.character(l[[v]]), all.name) | |
else | |
l[[v]] <- all.name | |
} | |
l | |
} | |
with_margins <- | |
function( FUN #< Function to compute one groups and margins. | |
, all.name = getOption("margins::all.name", "(All)") #< name to fill in margins. | |
){ | |
#! Compute marginals | |
#! | |
#! `with_margins` creates another function that performs the given | |
#! `FUN` function over the given groups and all possible marginals | |
#! of the groups, including an overall computation. | |
#! This will most commonly be used with summarise but can also | |
#! be used with other functions. | |
function(.data, ...){ | |
g <- groups(.data) | |
com <- lapply(seq(0, length(g)), combn, x=g, simplify=FALSE) %>% | |
Reduce(c, .) | |
com <- com[order(desc(sapply(com, length)))] | |
lapply(com, function(x, .data){ | |
FUN(group_by_(.data, .dots=x, add=FALSE), ...) %>% | |
mutate_( .dots = | |
structure( replicate(length(setdiff(g, x)), lazyeval::lazy(all.name), simplify=FALSE) | |
, names = as.character(setdiff(g,x)) | |
) | |
) | |
}, .data) %>% | |
bind_rows | |
} | |
} | |
if(FALSE){#! @example | |
.data <- expand.grid( x = c( 'a', 'b', 'c') | |
, y = c( 'd', 'e', 'f') | |
, .rep = 1:10 | |
, stringsAsFactors=FALSE | |
) %>% | |
mutate( v = rnorm(90)) %>% | |
select(-.rep) %>% | |
group_by(x, y) | |
with_margins(summarise)(.data, N=n(), sum=sum(v)) | |
margins(.data, N=n(), sum=sum(v)) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
spread_each <- | |
function( data #< A <data.frame> or <tbl> | |
, key #< unquoted variable name to make into columns | |
, ... #< (sub-)columns of values | |
, fill=NA #< passed to <spread>. | |
, convert=FALSE #< passed to <spread>. | |
, drop=FALSE #< passed to <spread>. | |
, sep='.' #< Separator that goes between the key value and "..." column names. | |
){ | |
#! wrapper for spread_each_ | |
spread_each_( data | |
, key.col = tidyr:::col_name(substitute(key)) | |
, .dots = lazyeval::lazy_dots(...) | |
, fill=fill, convert=convert, drop=drop, sep=sep | |
) | |
} | |
spread_each_ <- | |
function( data #< A <data.frame> or <tbl> | |
, key #< unquoted variable name to make into columns | |
, ... #< (sub-)columns of values | |
, .dots #< a <lazy_dots> list. | |
, key.col = tidyr:::col_name(substitute(key)) | |
#< the character name of the key column. | |
, fill=NA #< passed to <spread>. | |
, convert=FALSE #< passed to <spread>. | |
, drop=FALSE #< passed to <spread>. | |
, sep='.' #< Separator that goes between the key value and "..." column names. | |
){ | |
#! Spread a key column with multiple sub columns | |
#! | |
#! Creates a <tbl> with a the values of key as columns with | |
#! the variables listed in ... as sub columns. | |
dots <- lazyeval::all_dots(.dots, ...) | |
value.cols <- dplyr:::select_vars_(names(data), dots) | |
grouping.cols <- names(data) %>% setdiff(key.col) %>% setdiff(value.cols) | |
grouping.dots <- grouping.cols %>% | |
(lazyeval::as.lazy_dots)(.) %>% | |
dplyr:::resolve_vars(tbl_vars(data)) | |
data <- group_by_(data, .dots=grouping.dots) | |
f <- function(col){ | |
select.vars <- c(key.col, col, grouping.cols) %>% | |
lapply(as.name) %>% | |
(lazyeval::as.lazy_dots)(.) %>% | |
dplyr:::resolve_vars(tbl_vars(data)) | |
x <- | |
spread_( data = select_(data, .dots=select.vars), | |
, key_col = key.col | |
, value_col = col | |
, fill = fill | |
, sep = NULL | |
) | |
newcols <- setdiff(names(x), grouping.cols) | |
new.names <- | |
structure( newcols %>% lapply(as.name) %>% lapply(lazyeval::as.lazy) | |
, names=paste(newcols, col, sep=sep) | |
) | |
rename_(.data=x, .dots=new.names) | |
} | |
col.order <- | |
unique(getElement(data, key.col)) %>% as.character() %>% | |
lapply(., function(x, env){ | |
lazyeval::as.lazy(call('starts_with', x), env=env) | |
}, env=environment()) %>% | |
c(lapply(grouping.cols, lazyeval::as.lazy, env=environment()), .) | |
lapply(value.cols, f) %>% | |
Reduce(f=full_join, x=.) %>% | |
select_(., .dots=col.order) | |
} | |
if(FALSE){#! @example | |
data <- expand.grid( x = c( 'a', 'b', 'c') | |
, y = c( 'd', 'e', 'f') | |
, .rep = 1:10 | |
, stringsAsFactors=FALSE | |
) %>% | |
mutate( v = rnorm(90)) %>% | |
group_by(x, y) %>% | |
summarise(N=n(), sum=sum(v)) | |
# Data is a data.frame with columns x, y, N, and sum. | |
# Spread column y over columns N and sum | |
spread_each(data, y, N, sum) | |
# creates a tbl_df dataset with columns: | |
# x, d.N, d.sum, e.N, e.sum, f.N, f.sum | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome! That margin function is really missing in dplyr.
2 comments on your code:
mtcars %>% group_by(cyl, gear, carb) %>% summarise(n=n(), mmpg=mean(mpg))
returns a df with 5 columnsmtcars %>% group_by(cyl, gear, carb) %>% summarise(n=n(), mmpg=mean(mpg)) %>% margins(nn=sum(n), mmpg=weighted.mean(mmpg, n))
only returns 4 columns, carb is missing. do you have any idea how to fix this?