Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active January 1, 2016 14:39
Show Gist options
  • Select an option

  • Save arunsrinivasan/8159395 to your computer and use it in GitHub Desktop.

Select an option

Save arunsrinivasan/8159395 to your computer and use it in GitHub Desktop.
cheatsheet
# here's some sample data to test it out
require(data.table)
require(dplyr)
set.seed(45)
DF <- data.frame(x=sample(3, 25, TRUE), y=1:25, z=26:50)
DP <- tbl_df(DF) # for DPLYR data.frame object
DT <- data.table(DF)
# 1) row-wise subset (usually based on conditions):
# 1a) base
DF[DF$y >= 15L & DF$z >= 45L, ]
# alternatively
DF[with(DF, y >= 15L & z >= 45L), ]
# 1b) dplyr
filter(DP, y >= 15L & z >= 45L)
# 1c) data.table
DT[y >= 15L & z >= 45L]
# 2) column-wise subset
# 2a) base
DF[, "z", drop=FALSE] # returns 1-col data.frame
DF[, c("x", "y")]
# 2b) dplyr
select(DP, z)
select(DP, x, y)
# 2c) data.table
DT[, list(z)]
DT[, list(x, y)]
# or using column names the data.frame way
DT[, "z", with=FALSE]
DT[, c("x", "y"), with=FALSE]
# 3) adding columns without any aggregtation
# 3a) base
DF <- transform(DF, a = 1L, b = x+1L) # adding 1 column
# 3b) dplyr
DP <- mutate(DP, a = 1L, b = x+1L)
# 3c) data.table (no need to assign to another object, modifies by reference)
DT[, `:=`(a = 1L, b = x+1L)]
# adding only 1 column can also be done as: DT[, a := 1L]
# 4) adding a column with aggregated result with aggregated result repeated for each group
# 4a) base
DF <- transform(DF, agg = with(DF, ave(a, x, FUN=sum)))
# 4b) dplyr
DP <- DP %.% group_by(x) %.% mutate(agg = sum(a))
# 4c) data.table (no need to assign to another object, modifies by reference)
DT[, agg := sum(a), by = x]
# 5) ordering by columns
# 5a) base
DF[with(DF, order(x, -y)), ]
# 5b) dplyr
arrange(DP, x, -y)
# 5c) data.table
DT[order(x, -y)] # will be optimised using internal fastorder (FR #2405) when getOption(datatable.optimize) > 0L
# 6) Assigning values to columns for subset of rows
# 6a) base
DF[DF$agg == 11L, c("a", "b")] <- 0L
# 6b) dplyr
DP[DP$agg == 11L, c("a", "b")] <- 0L # (no dplyr specific way AFAIK)
# 6c) data.table (no need to assign to another object, modifies by reference)
DT[agg == 11L, c("a", "b") := 0L]
# or
DT[agg == 11L, `:=`(a=0L, b=0L)]
# 7) summarising/aggregation with aggregated result *not* repeated for each group
# 7a) base
agg.df <- aggregate(data=DF, as.matrix(DF[, -1]) ~ x, sum)
# 7b) dplyr
agg.dp <- group_by(DP, x) %.% summarise(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg))
# I am not aware of a way to group-summarise
# 7c) data.table
agg.dt <- DT[, list(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg)), by=x]
# imagine you've a 100 columns.. this becomes tedious.. you can just do:
agg.dt <- DT[, lapply(.SD, sum), by=x] # all columns other than "x" (grouping columns in by=)is in .SD by default
# use .SDcols to select the columns you want to aggregate with '.SD' as follows:
agg.dt <- DT[, lapply(.SD, sum), by=x, .SDcols=c("y", "a")] # only columns "y" and "a" will be in .SD
# joins!
# data sets for base
DF1 <- data.frame(x=1:4, y=5:8)
DF2 <- data.frame(x=3:6, y=10:7)
# data sets for dplyr
DP1 <- tbl_df(DF1)
DP2 <- tbl_df(DF2)
# data sets for data.table
DT1 <- data.table(DF1)
DT2 <- data.table(DF2)
# 8) left-join
# 8a) base
ans <- merge(DF1, DF2, all.x=TRUE, by="x")
# 8b) dplyr
ans <- left_join(DP1, DP2, by="x")
# 8c) data.table
# merge way
ans <- merge(DT1, DT2, by="x", all.x=TRUE)
# alternatively using the idiomatic DT way
setkey(DT1, "x")
setkey(DT2, "x")
DT2[DT1]
# 9) right-join
# 9a) base
ans <- merge(DF1, DF2, all.y=TRUE, by="x")
# 9b) dplyr
ans <- right_join(DP1, DP2, by="x") # not implemented yet
# 9c) data.table
# merge way
ans <- merge(DT1, DT2, by="x", all.y=TRUE)
# alternatively using the idiomatic DT way
setkey(DT1, "x")
setkey(DT2, "x")
DT1[DT2]
# 10) inner-join
# 10a) base
ans <- merge(DF1, DF2, by="x")
# 10b) dplyr
ans <- inner_join(DP1, DP2, by="x")
# 10c) data.table
# using 'merge'
ans <- merge(DT1, DT2, by="x")
# alternatively using the idiomatic DT way
setkey(DT1, x)
setkey(DT2, x)
DT2[DT1, nomatch=0L]
# 11) outer-join
# 11a) base
ans <- merge(DF1, DF2, by="x", all=TRUE)
# 11b) dplyr
ans <- outer_join(...) ##?? not implemented yet
# 11c) data.table
ans <- merge(DT1, DT2, by="x", all=TRUE)
# 10c) data.table
# merge way
ans <- merge(DT1, DT2, by="x")
# 12) semi-join
# 12a) base (# one way of doing it)
ans <- merge(DF1, DF2[, "x", drop=FALSE], by="x")
# 12b) dplyr
ans <- semi_join(DP1, DP2, by="x")
# 12c) data.table
# merge way
ans <- merge(DT1, DT2[, "x", with=FALSE], by="x")
# alternatively using the idiomatic DT way
setkey(DT1, x)
setkey(DT2, x)
DT1[DT2[, key(DT2), with=FALSE], nomatch=0L]
# 13) anti-join
# 13a) base (# one way of doing it)
# can't think of a direct way
# 13b) dplyr
ans <- anti_join(DP1, DP2, by="x")
# 13c) data.table
# alternatively using the idiomatic DT way
setkey(DT1, x)
setkey(DT2, x)
DT1[!DT2[, key(DT2), with=FALSE]]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment