arunsrinivasan · January 1, 2016 14:39
diff --git a/base_dplyr_datatable.R b/base_dplyr_datatable.R
 # here's some sample data to test it out
 require(data.table)
 require(dplyr)
 set.seed(45)
 DF <- data.frame(x=sample(3, 25, TRUE), y=1:25, z=26:50)
 DP <- tbl_df(DF) # for DPLYR data.frame object
 DT <- data.table(DF)

 # 1) row-wise subset (usually based on conditions):

 # 1a) base
 DF[DF$y >= 15L & DF$z >= 45L, ]
 # alternatively
 DF[with(DF, y >= 15L & z >= 45L), ]
 # 1b) dplyr
 filter(DP, y >= 15L & z >= 45L)
 # 1c) data.table
 DT[y >= 15L & z >= 45L]

 # 2) column-wise subset

 # 2a) base
 DF[, "z", drop=FALSE] # returns 1-col data.frame
 DF[, c("x", "y")]

 # 2b) dplyr
 select(DP, z)
 select(DP, x, y)

 # 2c) data.table
 DT[, list(z)]
 DT[, list(x, y)]

 # or using column names the data.frame way
 DT[, "z", with=FALSE]
 DT[, c("x", "y"), with=FALSE]

 #  3) adding columns without any aggregtation

 # 3a) base
 DF <- transform(DF, a = 1L, b = x+1L) # adding 1 column

 # 3b) dplyr
 DP <- mutate(DP, a = 1L, b = x+1L)

 # 3c) data.table (no need to assign to another object, modifies by reference)
 DT[, `:=`(a = 1L, b = x+1L)]
 # adding only 1 column can also be done as: DT[, a := 1L]

 # 4) adding a column with aggregated result with aggregated result repeated for each group

 # 4a) base
 DF <- transform(DF, agg = with(DF, ave(a, x, FUN=sum)))

 # 4b) dplyr
 DP <- DP %.% group_by(x) %.% mutate(agg = sum(a))

 # 4c) data.table (no need to assign to another object, modifies by reference)
 DT[, agg := sum(a), by = x]

 # 5) ordering by columns

 # 5a) base
 DF[with(DF, order(x, -y)), ]

 # 5b) dplyr
 arrange(DP, x, -y)

 # 5c) data.table
 DT[order(x, -y)] # will be optimised using internal fastorder (FR #2405) when getOption(datatable.optimize) > 0L

 # 6) Assigning values to columns for subset of rows

 # 6a) base
 DF[DF$agg == 11L, c("a", "b")] <- 0L

 # 6b) dplyr
 DP[DP$agg == 11L, c("a", "b")] <- 0L # (no dplyr specific way AFAIK)

 # 6c) data.table (no need to assign to another object, modifies by reference)
 DT[agg == 11L, c("a", "b") := 0L]
 # or 
 DT[agg == 11L, `:=`(a=0L, b=0L)]

 # 7) summarising/aggregation with aggregated result *not* repeated for each group

 # 7a) base
 agg.df <- aggregate(data=DF, as.matrix(DF[, -1]) ~ x, sum)

 # 7b) dplyr
 agg.dp <- group_by(DP, x) %.% summarise(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg))
 # I am not aware of a way to group-summarise

 # 7c) data.table
 agg.dt <- DT[, list(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg)), by=x]
 # imagine you've a 100 columns.. this becomes tedious.. you can just do:
 agg.dt <- DT[, lapply(.SD, sum), by=x] # all columns other than "x" (grouping columns in by=)is in .SD by default
 # use .SDcols to select the columns you want to aggregate with '.SD' as follows:
 agg.dt <- DT[, lapply(.SD, sum), by=x, .SDcols=c("y", "a")] # only columns "y" and "a" will be in .SD

 # joins!
 # data sets for base
 DF1 <- data.frame(x=1:4, y=5:8)
 DF2 <- data.frame(x=3:6, y=10:7)
 # data sets for dplyr
 DP1 <- tbl_df(DF1)
 DP2 <- tbl_df(DF2)
 # data sets for data.table
 DT1 <- data.table(DF1)
 DT2 <- data.table(DF2)

 # 8) left-join

 # 8a) base
 ans <- merge(DF1, DF2, all.x=TRUE, by="x")

 # 8b) dplyr
 ans <- left_join(DP1, DP2, by="x")

 # 8c) data.table

 # merge way
 ans <- merge(DT1, DT2, by="x", all.x=TRUE)

 # alternatively using the idiomatic DT way
 setkey(DT1, "x")
 setkey(DT2, "x")
 DT2[DT1]

 # 9) right-join

 # 9a) base
 ans <- merge(DF1, DF2, all.y=TRUE, by="x")

 # 9b) dplyr
 ans <- right_join(DP1, DP2, by="x") # not implemented yet

 # 9c) data.table

 # merge way
 ans <- merge(DT1, DT2, by="x", all.y=TRUE)

 # alternatively using the idiomatic DT way
 setkey(DT1, "x")
 setkey(DT2, "x")
 DT1[DT2]

 # 10) inner-join

 # 10a) base
 ans <- merge(DF1, DF2, by="x")

 # 10b) dplyr
 ans <- inner_join(DP1, DP2, by="x")

 # 10c) data.table
 # using 'merge'
 ans <- merge(DT1, DT2, by="x")

 # alternatively using the idiomatic DT way
 setkey(DT1, x)
 setkey(DT2, x)
 DT2[DT1, nomatch=0L]

 # 11) outer-join

 # 11a) base
 ans <- merge(DF1, DF2, by="x", all=TRUE)

 # 11b) dplyr
 ans <- outer_join(...) ##?? not implemented yet

 # 11c) data.table
 ans <- merge(DT1, DT2, by="x", all=TRUE)

 # 10c) data.table
 # merge way
 ans <- merge(DT1, DT2, by="x")

 # 12) semi-join
 # 12a) base (# one way of doing it)
 ans <- merge(DF1, DF2[, "x", drop=FALSE], by="x")

 # 12b) dplyr
 ans <- semi_join(DP1, DP2, by="x")

 # 12c) data.table

 # merge way
 ans <- merge(DT1, DT2[, "x", with=FALSE], by="x")

 # alternatively using the idiomatic DT way
 setkey(DT1, x)
 setkey(DT2, x)
 DT1[DT2[, key(DT2), with=FALSE], nomatch=0L]

 # 13) anti-join
 # 13a) base (# one way of doing it)
 # can't think of a direct way

 # 13b) dplyr
 ans <- anti_join(DP1, DP2, by="x")

 # 13c) data.table

 # alternatively using the idiomatic DT way
 setkey(DT1, x)
 setkey(DT2, x)
 DT1[!DT2[, key(DT2), with=FALSE]]
	# here's some sample data to test it out
	require(data.table)
	require(dplyr)
	set.seed(45)
	DF <- data.frame(x=sample(3, 25, TRUE), y=1:25, z=26:50)
	DP <- tbl_df(DF) # for DPLYR data.frame object
	DT <- data.table(DF)

	# 1) row-wise subset (usually based on conditions):

	# 1a) base
	DF[DF$y >= 15L & DF$z >= 45L, ]
	# alternatively
	DF[with(DF, y >= 15L & z >= 45L), ]
	# 1b) dplyr
	filter(DP, y >= 15L & z >= 45L)
	# 1c) data.table
	DT[y >= 15L & z >= 45L]

	# 2) column-wise subset

	# 2a) base
	DF[, "z", drop=FALSE] # returns 1-col data.frame
	DF[, c("x", "y")]

	# 2b) dplyr
	select(DP, z)
	select(DP, x, y)

	# 2c) data.table
	DT[, list(z)]
	DT[, list(x, y)]

	# or using column names the data.frame way
	DT[, "z", with=FALSE]
	DT[, c("x", "y"), with=FALSE]

	# 3) adding columns without any aggregtation

	# 3a) base
	DF <- transform(DF, a = 1L, b = x+1L) # adding 1 column

	# 3b) dplyr
	DP <- mutate(DP, a = 1L, b = x+1L)

	# 3c) data.table (no need to assign to another object, modifies by reference)
	DT[, `:=`(a = 1L, b = x+1L)]
	# adding only 1 column can also be done as: DT[, a := 1L]

	# 4) adding a column with aggregated result with aggregated result repeated for each group

	# 4a) base
	DF <- transform(DF, agg = with(DF, ave(a, x, FUN=sum)))

	# 4b) dplyr
	DP <- DP %.% group_by(x) %.% mutate(agg = sum(a))

	# 4c) data.table (no need to assign to another object, modifies by reference)
	DT[, agg := sum(a), by = x]

	# 5) ordering by columns

	# 5a) base
	DF[with(DF, order(x, -y)), ]

	# 5b) dplyr
	arrange(DP, x, -y)

	# 5c) data.table
	DT[order(x, -y)] # will be optimised using internal fastorder (FR #2405) when getOption(datatable.optimize) > 0L

	# 6) Assigning values to columns for subset of rows

	# 6a) base
	DF[DF$agg == 11L, c("a", "b")] <- 0L

	# 6b) dplyr
	DP[DP$agg == 11L, c("a", "b")] <- 0L # (no dplyr specific way AFAIK)

	# 6c) data.table (no need to assign to another object, modifies by reference)
	DT[agg == 11L, c("a", "b") := 0L]
	# or
	DT[agg == 11L, `:=`(a=0L, b=0L)]

	# 7) summarising/aggregation with aggregated result not repeated for each group

	# 7a) base
	agg.df <- aggregate(data=DF, as.matrix(DF[, -1]) ~ x, sum)

	# 7b) dplyr
	agg.dp <- group_by(DP, x) %.% summarise(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg))
	# I am not aware of a way to group-summarise

	# 7c) data.table
	agg.dt <- DT[, list(y=sum(y), z=sum(z), a=sum(a), b=sum(b), agg=sum(agg)), by=x]
	# imagine you've a 100 columns.. this becomes tedious.. you can just do:
	agg.dt <- DT[, lapply(.SD, sum), by=x] # all columns other than "x" (grouping columns in by=)is in .SD by default
	# use .SDcols to select the columns you want to aggregate with '.SD' as follows:
	agg.dt <- DT[, lapply(.SD, sum), by=x, .SDcols=c("y", "a")] # only columns "y" and "a" will be in .SD

	# joins!
	# data sets for base
	DF1 <- data.frame(x=1:4, y=5:8)
	DF2 <- data.frame(x=3:6, y=10:7)
	# data sets for dplyr
	DP1 <- tbl_df(DF1)
	DP2 <- tbl_df(DF2)
	# data sets for data.table
	DT1 <- data.table(DF1)
	DT2 <- data.table(DF2)

	# 8) left-join

	# 8a) base
	ans <- merge(DF1, DF2, all.x=TRUE, by="x")

	# 8b) dplyr
	ans <- left_join(DP1, DP2, by="x")

	# 8c) data.table

	# merge way
	ans <- merge(DT1, DT2, by="x", all.x=TRUE)

	# alternatively using the idiomatic DT way
	setkey(DT1, "x")
	setkey(DT2, "x")
	DT2[DT1]

	# 9) right-join

	# 9a) base
	ans <- merge(DF1, DF2, all.y=TRUE, by="x")

	# 9b) dplyr
	ans <- right_join(DP1, DP2, by="x") # not implemented yet

	# 9c) data.table

	# merge way
	ans <- merge(DT1, DT2, by="x", all.y=TRUE)

	# alternatively using the idiomatic DT way
	setkey(DT1, "x")
	setkey(DT2, "x")
	DT1[DT2]

	# 10) inner-join

	# 10a) base
	ans <- merge(DF1, DF2, by="x")

	# 10b) dplyr
	ans <- inner_join(DP1, DP2, by="x")

	# 10c) data.table
	# using 'merge'
	ans <- merge(DT1, DT2, by="x")

	# alternatively using the idiomatic DT way
	setkey(DT1, x)
	setkey(DT2, x)
	DT2[DT1, nomatch=0L]

	# 11) outer-join

	# 11a) base
	ans <- merge(DF1, DF2, by="x", all=TRUE)

	# 11b) dplyr
	ans <- outer_join(...) ##?? not implemented yet

	# 11c) data.table
	ans <- merge(DT1, DT2, by="x", all=TRUE)

	# 10c) data.table
	# merge way
	ans <- merge(DT1, DT2, by="x")

	# 12) semi-join
	# 12a) base (# one way of doing it)
	ans <- merge(DF1, DF2[, "x", drop=FALSE], by="x")

	# 12b) dplyr
	ans <- semi_join(DP1, DP2, by="x")

	# 12c) data.table

	# merge way
	ans <- merge(DT1, DT2[, "x", with=FALSE], by="x")

	# alternatively using the idiomatic DT way
	setkey(DT1, x)
	setkey(DT2, x)
	DT1[DT2[, key(DT2), with=FALSE], nomatch=0L]

	# 13) anti-join
	# 13a) base (# one way of doing it)
	# can't think of a direct way

	# 13b) dplyr
	ans <- anti_join(DP1, DP2, by="x")

	# 13c) data.table

	# alternatively using the idiomatic DT way
	setkey(DT1, x)
	setkey(DT2, x)
	DT1[!DT2[, key(DT2), with=FALSE]]
No results found