Created
January 21, 2019 07:37
-
-
Save aquasync/20006038a6d06b63ba4f3402e87e3024 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table)
dyn.load('mmap_alloc.dll')

## Create a ~74GB test dataset whose columns live in memory-mapped files,
## similar to
## https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping
N = 2e9; K = 100   # N rows in total, K distinct values for the "large group" ids
set.seed(1)

# Empty mmap columns, one backing file '<column>.bin' per column.
# NOTE(review): the first argument to mmap_vector looks like a type template
# (1L for the integer/factor-code columns, 1.0 for the double column) --
# confirm against the mmap_alloc source.
int_cols = c('id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'v1', 'v2')
DT = list()
for (col in int_cols) {
  DT[[col]] = .Call('mmap_vector', 1L, paste0(col, '.bin'), N)
}
DT$v3 = .Call('mmap_vector', 1.0, 'v3.bin', N)
setDT(DT)

# Factor templates the id1..id3 values are sampled from; their levels are
# attached to the finished columns further down.
fac1 = factor(sprintf("id%03d", 1:K))
fac2 = factor(sprintf("id%03d", 1:K))
fac3 = factor(sprintf("id%010d", 1:(N/K)))

# Populate each column in batches of n rows so only one batch of freshly
# sampled values is held in ordinary memory at a time.
n = 1e7 # batch size
# seq_len(N/n) would silently leave the trailing rows unfilled if the batch
# size did not divide N, so insist on an exact fit.
stopifnot(N %% n == 0)
for (i in seq_len(N/n)) {
  print(i)  # progress indicator
  rows = ((i-1)*n+1):(i*n)
  # The sample() calls are kept in this exact order so that set.seed(1)
  # reproduces the same dataset.
  DT[rows, `:=`(
    id1 = sample(fac1, n, TRUE),   # large groups (char)
    id2 = sample(fac2, n, TRUE),   # large groups (char)
    id3 = sample(fac3, n, TRUE),   # small groups (char)
    id4 = sample(K, n, TRUE),      # large groups (int)
    id5 = sample(K, n, TRUE),      # large groups (int)
    id6 = sample(N/K, n, TRUE),    # small groups (int)
    v1 = sample(5, n, TRUE),       # int in range [1,5]
    v2 = sample(5, n, TRUE),       # int in range [1,5]
    v3 = sample(round(runif(100, max=100), 4), n, TRUE) # numeric e.g. 23.5749
  )]
}

# Mark id1..id3 as factors by reference (setattr avoids copying the column)
# and save their attributes, so that a later session can re-attach them to the
# re-mapped columns.
fac_templates = list(id1 = fac1, id2 = fac2, id3 = fac3)
for (col in names(fac_templates)) {
  setattr(DT[[col]], 'class', 'factor')
  setattr(DT[[col]], 'levels', levels(fac_templates[[col]]))
  saveRDS(attributes(DT[[col]]), paste0(col, '.rds'))
}

# dataset is now persisted, can quit R
# save = "no": never prompt for, or attempt, saving a workspace that holds DT.
q(save = "no")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table)
dyn.load('mmap_alloc.dll')

## In a fresh session, map the test dataset persisted by the creation script.
# A fresh session has no N defined; it must equal the row count used when the
# '<column>.bin' files were created.
N = 2e9

# Re-create the mmap columns over the existing backing files.
# NOTE(review): the first argument to mmap_vector looks like a type template
# (1L for the integer/factor-code columns, 1.0 for the double column) --
# confirm against the mmap_alloc source.
int_cols = c('id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'v1', 'v2')
DT = list()
for (col in int_cols) {
  DT[[col]] = .Call('mmap_vector', 1L, paste0(col, '.bin'), N)
}
DT$v3 = .Call('mmap_vector', 1.0, 'v3.bin', N)

# Re-attach the saved attributes to id1..id3 by reference (setattr does not
# copy the column), in the order they were stored in the '<column>.rds' files.
for (col in c('id1', 'id2', 'id3')) {
  attrs = readRDS(paste0(col, '.rds'))
  for (a in names(attrs)) setattr(DT[[col]], a, attrs[[a]])
}
rm(attrs, col)
setDT(DT)

# Size of the table in GiB.
as.numeric(object.size(DT)) / (1024^3)
# => [1] 75.84692
print(DT)
# =>
#               id1   id2          id3 id4 id5      id6 v1 v2      v3
#          1: id027 id041 id0019118478  18   7 11405826  1  1 66.5912
#          2: id038 id080 id0010451099  69   5  4004947  4  1 83.3882
#          3: id058 id061 id0001640694  24  79  5784478  5  2 24.3000
#          4: id091 id003 id0015476884  52  82 11509253  1  3 24.8819
#          5: id021 id072 id0012410384  16  24   428139  1  3 14.0597
#         ---
# 1999999996: id082 id082 id0014122699  33  32 15834208  3  4 21.1701
# 1999999997: id087 id005 id0012759520  84  77 10500557  5  5  9.7848
# 1999999998: id025 id013 id0000760988  30  69 11662435  3  4 13.4854
# 1999999999: id061 id056 id0000080427  41  27  4990964  1  3 96.5136
# 2000000000: id044 id097 id0016173239  48  24  1969290  3  2 32.9924
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment