Skip to content

Instantly share code, notes, and snippets.

@arunsrinivasan
Last active January 4, 2016 12:49
Show Gist options
  • Select an option

  • Save arunsrinivasan/8623968 to your computer and use it in GitHub Desktop.

Select an option

Save arunsrinivasan/8623968 to your computer and use it in GitHub Desktop.
min_rank vs min - Hadley's "premature optimisation" point
require(dplyr)
require(data.table)
# Benchmark two dplyr ways of keeping, within each ID group, the row(s)
# holding the smallest `val`:
#   1. filter(DF, min_rank(val) == 1L)
#   2. filter(DF, val == min(val))
#
# N is the total number of rows of the generated table. The table is rebuilt
# for every entry of `group_sizes` (10, 100, ..., up to N / 10). Note that an
# entry g is the number of distinct IDs: IDs 1..g are each repeated N / g
# times, then shuffled.
#
# Returns (invisibly, because setnames() returns its input invisibly) a
# data.table with one row per entry of `group_sizes` and two columns,
# `min_rank` and `min`, holding the elapsed seconds of each approach.
foo <- function(N) {
  group_sizes <- 10^(1:(log10(N) - 1L))
  # Pool of distinct values; drawn before any seed is set inside this function.
  uniqval <- unique(runif(2 * N))

  time_one <- function(n_ids) {
    print(n_ids)
    # Re-seed so the two sample() calls below start from the same RNG state
    # for every entry of `group_sizes`.
    set.seed(1L)
    # Draw ID first, then val, to keep the RNG consumption order fixed.
    id_col <- sample(rep(1:n_ids, times = N / n_ids))
    val_col <- sample(uniqval, N, replace = FALSE)
    DF <- group_by(tbl_df(data.frame(ID = id_col, val = val_col)), ID)
    rm(id_col, val_col)

    elapsed <- rep(0, 2)
    elapsed[1] <- system.time(filter(DF, min_rank(val) == 1L))["elapsed"]
    elapsed[2] <- system.time(filter(DF, val == min(val)))["elapsed"]

    # Release the table before the next, possibly larger, one is built.
    rm(DF)
    gc()
    elapsed
  }

  timings <- lapply(group_sizes, time_one)
  # Stack the per-entry timing pairs into a two-column table.
  fans <- as.data.table(do.call(rbind, timings))
  setnames(fans, c("min_rank", "min"))
}
# Table sizes to benchmark (1e5, 1e6 and 1e7 rows). `ans` collects one timing
# table per size, in the same order as `N`.
N <- 10^seq(5, 7)
ans <- lapply(X = N, FUN = foo)
> ans
[[1]] # N=1e5
min_rank min
1: 0.102 0.004 # group_sizes[i]=10, i.e. 10 distinct IDs with N/10 rows each
2: 0.087 0.004 # =100
3: 0.088 0.006 # =1000
4: 0.110 0.025 # =10000
[[2]] # N=1e6
min_rank min
1: 1.728 0.068 # =10
2: 1.005 0.076 # =100
3: 0.905 0.079 # =1000
4: 0.885 0.099 # =10000
5: 1.086 0.368 # =100000
[[3]] # N=1e7
min_rank min
1: 27.832 0.676 # =10
2: 16.975 1.093 # =100
3: 10.180 0.874 # =1000
4: 9.107 0.971 # =10000
5: 8.963 1.180 # =100000
6: 11.776 2.961 # =1000000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment