-
-
Save nalimilan/8132114 to your computer and use it in GitHub Desktop.
using DataArrays | |
using NamedArrays | |
function table(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
l = [length(y) for y in x] | |
for i in 1:n | |
if l[1] != l[i] | |
error("arguments are not of the same length: $l") | |
end | |
end | |
lev = [levels(y) for y in x] | |
if usena | |
el = Array(Int, n) | |
nalev = [length(l) + 1 for l in lev] | |
a = zeros(Int, ntuple(n, i -> length(lev[i]) + 1)) | |
for i in 1:l[1] | |
for j in 1:n | |
val = int(x[j].refs[i]) | |
@inbounds el[j] = val == zero(val) ? nalev[j] : val | |
end | |
@inbounds a[el...] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
a = zeros(Int, ntuple(n, i -> length(lev[i]))) | |
for i in 1:l[1] | |
el = ntuple(n, j -> x[j].refs[i]) | |
pos = true | |
for val in el | |
if val == zero(val) | |
pos = false | |
break | |
end | |
end | |
if pos | |
@inbounds a[el...] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
function table2(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
l = [length(y) for y in x] | |
for i in 1:n | |
if l[1] != l[i] | |
error("arguments are not of the same length: $l") | |
end | |
end | |
lev = [levels(y) for y in x] | |
el = Array(Int, n) | |
if usena | |
nalev = [length(l) + 1 for l in lev] | |
a = zeros(Int, ntuple(n, i -> length(lev[i]) + 1)) | |
for i in 1:l[1] | |
for j in 1:n | |
val = int(x[j].refs[i]) | |
@inbounds el[j] = val == zero(val) ? nalev[j] : val | |
end | |
@inbounds a[el...] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
a = zeros(Int, ntuple(n, i -> length(lev[i]))) | |
for i in 1:l[1] | |
# ONLY CHANGE IS HERE | |
for j in 1:n | |
el[j] = x[j].refs[i] | |
end | |
pos = true | |
for val in el | |
if val == zero(val) | |
pos = false | |
break | |
end | |
end | |
if pos | |
@inbounds a[el...] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
function table3(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
len = [length(y) for y in x] | |
for i in 1:n | |
if len[1] != len[i] | |
error(string("arguments are not of the same length: ", tuple(len...))) | |
end | |
end | |
lev = [levels(y) for y in x] | |
if usena | |
dims = ntuple(n, i -> length(lev[i]) + 1) | |
# The first way of building nalev gives and Any array, which hurts performance | |
# nalev = [dim + 1 for dim in dims] | |
nalev = [length(lev[i]) + 1 for i in 1:n] | |
sizes = cumprod(nalev) | |
a = zeros(Int, dims) | |
for i in 1:len[1] | |
el = int(x[1].refs[i])::Int | |
for j in 2:n | |
val = int(x[j].refs[i])::Int | |
if val == zero(val) | |
val = nalev[j] | |
end | |
el += int((val - 1) * sizes[j - 1])::Int | |
end | |
@inbounds a[el] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
dims = ntuple(n, i -> length(lev[i])) | |
sizes = cumprod([dims...]) | |
a = zeros(Int, dims) | |
for i in 1:len[1] | |
pos = (x[1].refs[i] != zero(Uint)) | |
el = int(x[1].refs[i])::Int | |
for j in 2:n | |
val = x[j].refs[i] | |
if val == zero(val) | |
pos = false | |
break | |
end | |
el += int((val - 1) * sizes[j - 1])::Int | |
end | |
if pos | |
@inbounds a[el] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
## To test | |
a = PooledDataArray(rep(1:10, 100000)) | |
precompile(table, (a,)) | |
precompile(table2, (a,)) | |
precompile(table3, (a,)) | |
@time table(a) | |
@time table2(a) | |
@time table3(a) | |
precompile(table, (a, a)) | |
precompile(table2, (a, a)) | |
precompile(table3, (a, a)) | |
@time table(a, a) | |
@time table2(a, a) | |
@time table3(a, a) |
@johnmyleswhyte I've added a table2()
function which uses a loop instead of ntuple()
with an anonymous function. The result is almost twice slower and it allocates twice more memory. I think this is because a[el...]
is much more efficient when el
is a tuple than when it is an array.
table3()
, which computes the linear index manually, is on par with R's performance for the one dimensional case, and very slightly slower for the two- and three-dimensional cases. With a few small changes we can probably get a little more efficient.
With more type annotations, table3()
is really fast: 0.1s and only 3920 bytes allocated for a cross table of 1M by 1M elements as in the example above. table()
takes 1.4s with 279MB allocated, and table2()
2.5s with 472MB. R takes 0.28s for the same task.
@johnmyleswhite Are you happy with the code above? What do you think of the interface? R allows useNA
to be ifany
, always
, never
-- I'm not sure that's useful. We should also think about supporting weights, which should be quite easy.
Yeah, but
map
is really not the bottleneck since it's only run once. I tried replacing anonymous functions with real ones, and the difference was really small. So I thought I would keep them for now, but they are easy to replace.