Skip to content

Instantly share code, notes, and snippets.

@jangorecki
Last active June 10, 2020 23:54
Show Gist options
  • Save jangorecki/cf2ad1a01e7f1493a4bd3ef4444e1cbc to your computer and use it in GitHub Desktop.
Save jangorecki/cf2ad1a01e7f1493a4bd3ef4444e1cbc to your computer and use it in GitHub Desktop.
sort merge benchmark
ssa = function(unq_n, size, sort=FALSE) {
if (unq_n > size) return(sample.int(unq_n, size))
unq_sub = seq_len(unq_n)
ans = sample(c(unq_sub, sample(unq_sub, size=max(size-unq_n, 0), replace=TRUE)))
if (sort) sort(ans) else ans
}
set.seed(108)
library(data.table)
options(width=200)
options(datatable.auto.index=FALSE, datatable.verbose=FALSE) ## not needed but just to be future proof if forder will setindex
N = 1e9L
## unsorted no duplicates
d1 = data.table(x=ssa(N, N))[, "v1":=seq_len(.N)]
d2 = data.table(y=ssa(N, N))[, "v2":=seq_len(.N)]
#d1 = data.table(x=ssa(N-1L, N))[, "v1":=seq_len(.N)] ## unsorted single duplicate
#d2 = data.table(y=ssa(N-1L, N))[, "v2":=seq_len(.N)]
setDTthreads(1L) ## single thread no index
options(datatable.smerge=FALSE)
system.time(b <- d1[d2, on="x==y"])
options(datatable.smerge=TRUE)
system.time(s <- d1[d2, on="x==y"])
all.equal(b, s)
setDTthreads(40L) ## all threads no index
options(datatable.smerge=FALSE)
system.time(b <- d1[d2, on="x==y"])
options(datatable.smerge=TRUE)
system.time(s <- d1[d2, on="x==y"])
all.equal(b, s)
setDTthreads(40L) ## all threads index
setindexv2 = function(x, cols) { ## pretend we are after #4386
stopifnot(is.data.table(x), is.character(cols))
if (is.null(attr(x, "index", TRUE))) setattr(x, "index", integer())
setattr(attr(x, "index", TRUE), paste0("__", cols, collapse="__"), data.table:::forderv(x, cols, retGrp=TRUE))
invisible(x)
}
setindexv2(d1, "x"); setindexv2(d2, "y")
options(datatable.smerge=FALSE)
system.time(b <- d1[d2, on="x==y"])
options(datatable.smerge=TRUE)
system.time(s <- d1[d2, on="x==y"])
all.equal(b, s)
setkeyv(d1, "x"); setkeyv(d2, "y"); ## all threads sorted index
setindexv2(d1, "x"); setindexv2(d2, "y")
options(datatable.smerge=FALSE)
system.time(b <- d1[d2, on="x==y"])
options(datatable.smerge=TRUE)
system.time(s <- d1[d2, on="x==y"])
all.equal(b, s)
#options(datatable.smerge=FALSE, datatable.verbose=TRUE)
#system.time(b <- d1[d2, on="x==y"])
#options(datatable.smerge=TRUE, datatable.verbose=TRUE)
#system.time(s <- d1[d2, on="x==y"])
# no duplicates
## single thread no index
user system elapsed
bmerge 394.931 32.802 427.750
smerge 336.447 73.674 410.139
## all threads no index
user system elapsed
bmerge 819.928 143.668 368.546
smerge 1136.770 166.111 100.471
## all threads index
user system elapsed
bmerge 658.381 103.473 377.290
smerge 579.559 109.165 78.886
## all threads sorted index
user system elapsed
bmerge 68.579 47.075 69.485
smerge 37.985 42.468 20.842
# single duplicate
## single thread no index
user system elapsed
bmerge 429.513 34.142 463.676
smerge 367.919 80.007 447.952
## all threads no index
user system elapsed
bmerge 819.215 149.786 368.750
smerge 1191.925 212.859 137.033
## all threads index
user system elapsed
bmerge 654.823 98.173 379.881
smerge 623.015 160.435 115.979
## all threads sorted index
user system elapsed
bmerge 87.594 47.124 94.729
smerge 71.851 64.715 42.599
# no duplicates
> options(datatable.smerge=FALSE, datatable.verbose=TRUE)
> system.time(b <- d1[d2, on="x==y"])
i.y has same type (integer) as x.x. No coercion needed.
on= matches existing key, using key
Starting bmerge ...
bmerge done in 45.6s elapsed (42.7s cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 0.000s elapsed (0.000s cpu)
user system elapsed
69.703 34.536 73.928
> options(datatable.smerge=TRUE, datatable.verbose=TRUE)
> system.time(s <- d1[d2, on="x==y"])
Starting smerge ...
smergeR: index (already indexed) took 0.000s
smergeR: sort (already sorted) took 0.000s
smergeR: alloc of size 1000000000 took 0.492s
smergeC: grpLens (already unique) took 0.000s
batching: input 1000000000 into balanced 80 batches (batchSize=12500000, lastBat
chSize=12500000) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:
...
smergeC: preparing 80 batches took 0.003s
smergeC: 80 calls to smerge using 40/40 threads took 0.339s
smergeR: smergeC of 1000000000 x 1000000000 = 1000000000; took 0.342s
smergeR: outSmerge (was sorted) took 0.000s
smergeR: all took 0.835s
smerge done in 0.836s elapsed (14.8s cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 0.001s elapsed (0.037s cpu)
user system elapsed
38.082 42.926 20.368
# single duplicate
> options(datatable.smerge=FALSE, datatable.verbose=TRUE)
> system.time(b <- d1[d2, on="x==y"])
i.y has same type (integer) as x.x. No coercion needed.
on= matches existing key, using key
Starting bmerge ...
bmerge done in 46.6s elapsed (42.9s cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 20.0s elapsed (15.6s cpu)
user system elapsed
85.789 53.102 91.941
> options(datatable.smerge=TRUE, datatable.verbose=TRUE)
> system.time(s <- d1[d2, on="x==y"])
Starting smerge ...
smergeR: index (already indexed) took 0.000s
smergeR: sort (already sorted) took 0.000s
smergeR: alloc of size 1000000000 took 0.472s
smergeC: grpLens (x, y) took 0.487s
batching: input 999999999 into balanced 80 batches (batchSize=12500000, lastBatc
hSize=12499999) of sorted x y: x[1]<=y[1] && x[nx]>=y[ny]:
...
smergeC: preparing 80 batches took 0.002s
smergeC: 80 calls to smerge using 40/40 threads took 0.797s
smergeR: smergeC of 1000000000 x 1000000000 = 1000000001; took 1.286s
smergeR: outSmerge (was sorted) took 0.000s
smergeR: all took 1.759s
smerge done in 1.760s elapsed (34.0s cpu)
Constructing irows for '!byjoin || nqbyjoin' ... 20.1s elapsed (15.9s cpu)
user system elapsed
74.858 59.953 43.362
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment