Last active
January 1, 2016 10:29
-
-
Save nalimilan/8132114 to your computer and use it in GitHub Desktop.
Draft implementation of table() for PooledDataVectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using DataArrays | |
using NamedArrays | |
function table(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
l = [length(y) for y in x] | |
for i in 1:n | |
if l[1] != l[i] | |
error("arguments are not of the same length: $l") | |
end | |
end | |
lev = [levels(y) for y in x] | |
if usena | |
el = Array(Int, n) | |
nalev = [length(l) + 1 for l in lev] | |
a = zeros(Int, ntuple(n, i -> length(lev[i]) + 1)) | |
for i in 1:l[1] | |
for j in 1:n | |
val = int(x[j].refs[i]) | |
@inbounds el[j] = val == zero(val) ? nalev[j] : val | |
end | |
@inbounds a[el...] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
a = zeros(Int, ntuple(n, i -> length(lev[i]))) | |
for i in 1:l[1] | |
el = ntuple(n, j -> x[j].refs[i]) | |
pos = true | |
for val in el | |
if val == zero(val) | |
pos = false | |
break | |
end | |
end | |
if pos | |
@inbounds a[el...] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
function table2(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
l = [length(y) for y in x] | |
for i in 1:n | |
if l[1] != l[i] | |
error("arguments are not of the same length: $l") | |
end | |
end | |
lev = [levels(y) for y in x] | |
el = Array(Int, n) | |
if usena | |
nalev = [length(l) + 1 for l in lev] | |
a = zeros(Int, ntuple(n, i -> length(lev[i]) + 1)) | |
for i in 1:l[1] | |
for j in 1:n | |
val = int(x[j].refs[i]) | |
@inbounds el[j] = val == zero(val) ? nalev[j] : val | |
end | |
@inbounds a[el...] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
a = zeros(Int, ntuple(n, i -> length(lev[i]))) | |
for i in 1:l[1] | |
# ONLY CHANGE IS HERE | |
for j in 1:n | |
el[j] = x[j].refs[i] | |
end | |
pos = true | |
for val in el | |
if val == zero(val) | |
pos = false | |
break | |
end | |
end | |
if pos | |
@inbounds a[el...] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
function table3(x::PooledDataVector...; usena::Bool = false) | |
n = length(x) | |
len = [length(y) for y in x] | |
for i in 1:n | |
if len[1] != len[i] | |
error(string("arguments are not of the same length: ", tuple(len...))) | |
end | |
end | |
lev = [levels(y) for y in x] | |
if usena | |
dims = ntuple(n, i -> length(lev[i]) + 1) | |
# The first way of building nalev gives and Any array, which hurts performance | |
# nalev = [dim + 1 for dim in dims] | |
nalev = [length(lev[i]) + 1 for i in 1:n] | |
sizes = cumprod(nalev) | |
a = zeros(Int, dims) | |
for i in 1:len[1] | |
el = int(x[1].refs[i])::Int | |
for j in 2:n | |
val = int(x[j].refs[i])::Int | |
if val == zero(val) | |
val = nalev[j] | |
end | |
el += int((val - 1) * sizes[j - 1])::Int | |
end | |
@inbounds a[el] += 1 | |
end | |
NamedArray(a, ntuple(n, i -> [lev[i], "NA"]), ntuple(n, i -> "Dim$i")) | |
else | |
dims = ntuple(n, i -> length(lev[i])) | |
sizes = cumprod([dims...]) | |
a = zeros(Int, dims) | |
for i in 1:len[1] | |
pos = (x[1].refs[i] != zero(Uint)) | |
el = int(x[1].refs[i])::Int | |
for j in 2:n | |
val = x[j].refs[i] | |
if val == zero(val) | |
pos = false | |
break | |
end | |
el += int((val - 1) * sizes[j - 1])::Int | |
end | |
if pos | |
@inbounds a[el] += 1 | |
end | |
end | |
NamedArrays.NamedArray(a, ntuple(n, i -> lev[i]), ntuple(n, i -> "Dim$i")) | |
end | |
end | |
## To test | |
a = PooledDataArray(rep(1:10, 100000)) | |
precompile(table, (a,)) | |
precompile(table2, (a,)) | |
precompile(table3, (a,)) | |
@time table(a) | |
@time table2(a) | |
@time table3(a) | |
precompile(table, (a, a)) | |
precompile(table2, (a, a)) | |
precompile(table3, (a, a)) | |
@time table(a, a) | |
@time table2(a, a) | |
@time table3(a, a) |
table3()
, which computes the linear index manually, is on par with R's performance for the one dimensional case, and very slightly slower for the two- and three-dimensional cases. With a few small changes we can probably get a little more efficient.
With more type annotations, table3()
is really fast: 0.1s and only 3920 bytes allocated for a cross table of 1M by 1M elements as in the example above. table()
takes 1.4s with 279MB allocated, and table2()
2.5s with 472MB. R takes 0.28s for the same task.
@johnmyleswhite Are you happy with the code above? What do you think of the interface? R allows useNA
to be ifany
, always
, never
-- I'm not sure that's useful. We should also think about supporting weights, which should be quite easy.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@johnmyleswhyte I've added a
table2()
function which uses a loop instead ofntuple()
with an anonymous function. The result is almost twice slower and it allocates twice more memory. I think this is becausea[el...]
is much more efficient whenel
is a tuple than when it is an array.