-
-
Save nalimilan/8132114 to your computer and use it in GitHub Desktop.
using DataArrays | |
using NamedArrays | |
function table(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
l = [length(y) for y in x] | |
for i in 1:n | |
if l[1] != l[i] | |
error("arguments are not of the same length: $l") | |
end | |
end | |
lev = [levels(y) for y in x] | |
if usena | |
el = Array(Int, n) | |
nalev = [length(l) + 1 for l in lev] | |
a = zeros(Int, ntuple(n, i -> length(lev[i]) + 1)) | |
for i in 1:l[1] | |
for j in 1:n | |
val = int(x[j].refs[i]) | |
@inbounds el[j] = val == zero(val) ? nalev[j] : val | |
end | |
@inbounds a[el...] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
a = zeros(Int, ntuple(n, i -> length(lev[i]))) | |
for i in 1:l[1] | |
el = ntuple(n, j -> x[j].refs[i]) | |
pos = true | |
for val in el | |
if val == zero(val) | |
pos = false | |
break | |
end | |
end | |
if pos | |
@inbounds a[el...] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
function table2(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
l = [length(y) for y in x] | |
for i in 1:n | |
if l[1] != l[i] | |
error("arguments are not of the same length: $l") | |
end | |
end | |
lev = [levels(y) for y in x] | |
el = Array(Int, n) | |
if usena | |
nalev = [length(l) + 1 for l in lev] | |
a = zeros(Int, ntuple(n, i -> length(lev[i]) + 1)) | |
for i in 1:l[1] | |
for j in 1:n | |
val = int(x[j].refs[i]) | |
@inbounds el[j] = val == zero(val) ? nalev[j] : val | |
end | |
@inbounds a[el...] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
a = zeros(Int, ntuple(n, i -> length(lev[i]))) | |
for i in 1:l[1] | |
# ONLY CHANGE IS HERE | |
for j in 1:n | |
el[j] = x[j].refs[i] | |
end | |
pos = true | |
for val in el | |
if val == zero(val) | |
pos = false | |
break | |
end | |
end | |
if pos | |
@inbounds a[el...] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
function table3(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
len = [length(y) for y in x] | |
for i in 1:n | |
if len[1] != len[i] | |
error(string("arguments are not of the same length: ", tuple(len...))) | |
end | |
end | |
lev = [levels(y) for y in x] | |
if usena | |
dims = ntuple(n, i -> length(lev[i]) + 1) | |
# The first way of building nalev gives and Any array, which hurts performance | |
# nalev = [dim + 1 for dim in dims] | |
nalev = [length(lev[i]) + 1 for i in 1:n] | |
sizes = cumprod(nalev) | |
a = zeros(Int, dims) | |
for i in 1:len[1] | |
el = int(x[1].refs[i])::Int | |
for j in 2:n | |
val = int(x[j].refs[i])::Int | |
if val == zero(val) | |
val = nalev[j] | |
end | |
el += int((val - 1) * sizes[j - 1])::Int | |
end | |
@inbounds a[el] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
dims = ntuple(n, i -> length(lev[i])) | |
sizes = cumprod([dims...]) | |
a = zeros(Int, dims) | |
for i in 1:len[1] | |
pos = (x[1].refs[i] != zero(Uint)) | |
el = int(x[1].refs[i])::Int | |
for j in 2:n | |
val = x[j].refs[i] | |
if val == zero(val) | |
pos = false | |
break | |
end | |
el += int((val - 1) * sizes[j - 1])::Int | |
end | |
if pos | |
@inbounds a[el] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
## To test | |
a = PooledDataArray(rep(1:10, 100000)) | |
precompile(table, (a,)) | |
precompile(table2, (a,)) | |
precompile(table3, (a,)) | |
@time table(a) | |
@time table2(a) | |
@time table3(a) | |
precompile(table, (a, a)) | |
precompile(table2, (a, a)) | |
precompile(table3, (a, a)) | |
@time table(a, a) | |
@time table2(a, a) | |
@time table3(a, a) |
Yeah, but map
is really not the bottleneck since it's only run once. I tried replacing anonymous functions with real ones, and the difference was really small. So I thought I would keep them for now, but they are easy to replace.
@johnmyleswhyte I've added a table2()
function which uses a loop instead of ntuple()
with an anonymous function. The result is almost twice slower and it allocates twice more memory. I think this is because a[el...]
is much more efficient when el
is a tuple than when it is an array.
table3()
, which computes the linear index manually, is on par with R's performance for the one dimensional case, and very slightly slower for the two- and three-dimensional cases. With a few small changes we can probably get a little more efficient.
With more type annotations, table3()
is really fast: 0.1s and only 3920 bytes allocated for a cross table of 1M by 1M elements as in the example above. table()
takes 1.4s with 279MB allocated, and table2()
2.5s with 472MB. R takes 0.28s for the same task.
@johnmyleswhite Are you happy with the code above? What do you think of the interface? R allows useNA
to be ifany
, always
, never
-- I'm not sure that's useful. We should also think about supporting weights, which should be quite easy.
If you're interested in writing library code, you should avoid anonymous functions and calls to map. Both are too slow to be used in backend code.