Last active
January 1, 2016 14:39
-
-
Save arunsrinivasan/8159395 to your computer and use it in GitHub Desktop.
cheatsheet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # here's some sample data to test it out | |
| require(data.table) | |
| require(dplyr) | |
| set.seed(45) | |
| DF <- data.frame(x=sample(3, 25, TRUE), y=1:25, z=26:50) | |
| DP <- tbl_df(DF) # for DPLYR data.frame object | |
| DT <- data.table(DF) | |
| # 1) row-wise subset (usually based on conditions): | |
| # 1a) base | |
| DF[DF$y >= 15L & DF$z >= 45L, ] | |
| # alternatively | |
| DF[with(DF, y >= 15L & z >= 45L), ] | |
| # 1b) dplyr | |
| filter(DP, y >= 15L & z >= 45L) | |
| # 1c) data.table | |
| DT[y >= 15L & z >= 45L] | |
| # 2) column-wise subset | |
| # 2a) base | |
| DF[, "z", drop=FALSE] # returns 1-col data.frame | |
| DF[, c("x", "y")] | |
| # 2b) dplyr | |
| select(DP, z) | |
| select(DP, x, y) | |
| # 2c) data.table | |
| DT[, list(z)] | |
| DT[, list(x, y)] | |
| # or using column names the data.frame way | |
| DT[, "z", with=FALSE] | |
| DT[, c("x", "y"), with=FALSE] | |
| # 3) adding columns without any aggregtation | |
| # 3a) base | |
| DF <- transform(DF, a = 1L, b = x+1L) # adding 1 column | |
| # 3b) dplyr | |
| DP <- mutate(DP, a = 1L, b = x+1L) | |
| # 3c) data.table (no need to assign to another object, modifies by reference) | |
| DT[, `:=`(a = 1L, b = x+1L)] | |
| # adding only 1 column can also be done as: DT[, a := 1L] | |
| # 4) adding a column with aggregated result with aggregated result repeated for each group | |
| # 4a) base | |
| DF <- transform(DF, agg = with(DF, ave(a, x, FUN=sum))) | |
| # 4b) dplyr | |
| DP <- DP %.% group_by(x) %.% mutate(agg = sum(a)) | |
| # 4c) data.table (no need to assign to another object, modifies by reference) | |
| DT[, agg := sum(a), by = x] | |
| # 5) ordering by columns | |
| # 5a) base | |
| DF[with(DF, order(x, -y)), ] | |
| # 5b) dplyr | |
| arrange(DP, x, -y) | |
| # 5c) data.table | |
| DT[order(x, -y)] # will be optimised using internal fastorder (FR #2405) when getOption(datatable.optimize) > 0L | |
| # 6) Assigning values to columns for subset of rows | |
| # 6a) base | |
| DF[DF$agg == 11L, c("a", "b")] <- 0L | |
| # 6b) dplyr | |
| DP[DP$agg == 11L, c("a", "b")] <- 0L # (no dplyr specific way AFAIK) | |
| # 6c) data.table (no need to assign to another object, modifies by reference) | |
| DT[agg == 11L, c("a", "b") := 0L] | |
| # or | |
| DT[agg == 11L, `:=`(a=0L, b=0L)] | |
| # 7) summarising/aggregation with aggregated result *not* repeated for each group | |
| # 7a) base | |
| agg.df <- aggregate(data=DF, as.matrix(DF[, -1]) ~ x, sum) | |
| # 7b) dplyr | |
| agg.dp <- group_by(DP, x) %.% summarise(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg)) | |
| # I am not aware of a way to group-summarise | |
| # 7c) data.table | |
| agg.dt <- DT[, list(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg)), by=x] | |
| # imagine you've a 100 columns.. this becomes tedious.. you can just do: | |
| agg.dt <- DT[, lapply(.SD, sum), by=x] # all columns other than "x" (grouping columns in by=)is in .SD by default | |
| # use .SDcols to select the columns you want to aggregate with '.SD' as follows: | |
| agg.dt <- DT[, lapply(.SD, sum), by=x, .SDcols=c("y", "a")] # only columns "y" and "a" will be in .SD | |
| # joins! | |
| # data sets for base | |
| DF1 <- data.frame(x=1:4, y=5:8) | |
| DF2 <- data.frame(x=3:6, y=10:7) | |
| # data sets for dplyr | |
| DP1 <- tbl_df(DF1) | |
| DP2 <- tbl_df(DF2) | |
| # data sets for data.table | |
| DT1 <- data.table(DF1) | |
| DT2 <- data.table(DF2) | |
| # 8) left-join | |
| # 8a) base | |
| ans <- merge(DF1, DF2, all.x=TRUE, by="x") | |
| # 8b) dplyr | |
| ans <- left_join(DP1, DP2, by="x") | |
| # 8c) data.table | |
| # merge way | |
| ans <- merge(DT1, DT2, by="x", all.x=TRUE) | |
| # alternatively using the idiomatic DT way | |
| setkey(DT1, "x") | |
| setkey(DT2, "x") | |
| DT2[DT1] | |
| # 9) right-join | |
| # 9a) base | |
| ans <- merge(DF1, DF2, all.y=TRUE, by="x") | |
| # 9b) dplyr | |
| ans <- right_join(DP1, DP2, by="x") # not implemented yet | |
| # 9c) data.table | |
| # merge way | |
| ans <- merge(DT1, DT2, by="x", all.y=TRUE) | |
| # alternatively using the idiomatic DT way | |
| setkey(DT1, "x") | |
| setkey(DT2, "x") | |
| DT1[DT2] | |
| # 10) inner-join | |
| # 10a) base | |
| ans <- merge(DF1, DF2, by="x") | |
| # 10b) dplyr | |
| ans <- inner_join(DP1, DP2, by="x") | |
| # 10c) data.table | |
| # using 'merge' | |
| ans <- merge(DT1, DT2, by="x") | |
| # alternatively using the idiomatic DT way | |
| setkey(DT1, x) | |
| setkey(DT2, x) | |
| DT2[DT1, nomatch=0L] | |
| # 11) outer-join | |
| # 11a) base | |
| ans <- merge(DF1, DF2, by="x", all=TRUE) | |
| # 11b) dplyr | |
| ans <- outer_join(...) ##?? not implemented yet | |
| # 11c) data.table | |
| ans <- merge(DT1, DT2, by="x", all=TRUE) | |
| # 10c) data.table | |
| # merge way | |
| ans <- merge(DT1, DT2, by="x") | |
| # 12) semi-join | |
| # 12a) base (# one way of doing it) | |
| ans <- merge(DF1, DF2[, "x", drop=FALSE], by="x") | |
| # 12b) dplyr | |
| ans <- semi_join(DP1, DP2, by="x") | |
| # 12c) data.table | |
| # merge way | |
| ans <- merge(DT1, DT2[, "x", with=FALSE], by="x") | |
| # alternatively using the idiomatic DT way | |
| setkey(DT1, x) | |
| setkey(DT2, x) | |
| DT1[DT2[, key(DT2), with=FALSE], nomatch=0L] | |
| # 13) anti-join | |
| # 13a) base (# one way of doing it) | |
| # can't think of a direct way | |
| # 13b) dplyr | |
| ans <- anti_join(DP1, DP2, by="x") | |
| # 13c) data.table | |
| # alternatively using the idiomatic DT way | |
| setkey(DT1, x) | |
| setkey(DT2, x) | |
| DT1[!DT2[, key(DT2), with=FALSE]] | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment