Skip to content

Instantly share code, notes, and snippets.

@nalimilan
Created March 7, 2017 15:50
Show Gist options
  • Save nalimilan/aa9391f204967adf70fce3700eab5885 to your computer and use it in GitHub Desktop.
Save nalimilan/aa9391f204967adf70fce3700eab5885 to your computer and use it in GitHub Desktop.
using DataTables
using BenchmarkTools
using Distributions
# Fixed seed so the randomly generated benchmark inputs are reproducible.
srand(1);

# Small table (2000 rows). `a` pairs values drawn from [1, 2, 4] with the Boolean
# mask `rand(2000) .> .75`; `b` holds symbols, `c` strings, and `d` values drawn
# from two random floats.
a = NullableArray(rand([1, 2, 4], 2000), rand(2000) .> .75)
b = rand([:a, :b], 2000);
c = rand(["a", "b"], 2000);
d = rand(rand(2), 2000);
# Column D is bound to `d`. Binding it to `c` (as before) duplicated column C and
# left `d` unused.
small = DataTable(A = a, B = b, C = c, D = d);

# Large table. A-E each hold 889100 random floats with every value repeated five
# times in a row (inner=5); F holds 889100*5 random floats.
A = repeat(rand(889100), inner=5);
B = repeat(rand(889100), inner=5);
C = repeat(rand(889100), inner=5);
D = repeat(rand(889100), inner=5);
E = repeat(rand(889100), inner=5);
F = repeat(rand(889100*5));
large = DataTable(A = A, B = B, C = C, D = D, E = E, F = F);

# Categorical tables: dt1 uses the levels 1:10 and dt2 the levels 1:100. In both,
# v1 repeats each level consecutively (inner) and v2 cycles through the levels (outer).
dt1 = DataTable(v1 = CategoricalArray(repeat(1:10, inner=1000)), v2 = CategoricalArray(repeat(1:10, outer=1000)));
dt2 = DataTable(v1 = CategoricalArray(repeat(1:100, inner=100)), v2 = CategoricalArray(repeat(1:100, outer=100)));
# dt3 places the columns of dt1 and dt2 side by side.
dt3 = hcat(dt1, dt2);
# Build a DataTable with `nrow` rows and one column per entry of `col_values`.
# Each column is filled by calling `sample` on the entry's value pool. A pool that
# is a CategoricalArray yields a categorical column; any other pool yields a
# NullableArray column.
function random_frame(nrow::Int, col_values::Dict{Symbol, Any})
    colnames = collect(keys(col_values))
    columns = Any[]
    for name in colnames
        pool = col_values[name]
        if isa(pool, CategoricalArray)
            push!(columns, categorical(sample(pool, nrow)))
        else
            push!(columns, NullableArray(sample(pool, nrow)))
        end
    end
    return DataTable(columns, colnames)
end
# Generate two random tables and join them.
# The left table has `nrow_left` rows and the columns of `on_col_values` merged
# with `left_col_values`; the right table has `nrow_right` rows and the columns of
# `on_col_values` merged with `right_col_values`. The tables are joined on the
# keys of `on_col_values`, with `kind` forwarded to `join`.
function random_join(kind::Symbol, nrow_left::Int, nrow_right::Int,
                     on_col_values::Dict{Symbol, Any},
                     left_col_values::Dict{Symbol, Any},
                     right_col_values::Dict{Symbol, Any})
    left_spec = merge(on_col_values, left_col_values)
    left_table = random_frame(nrow_left, left_spec)
    right_spec = merge(on_col_values, right_col_values)
    right_table = random_frame(nrow_right, right_spec)
    join_cols = collect(keys(on_col_values))
    return join(left_table, right_table, on = join_cols, kind = kind)
end
# Outer join of a 1000-row and a 2000-row random table on four key columns
# (:A, :C, :D drawn from 1:10 and :B drawn from four symbols), with two extra
# columns on each side.
function f()
    on_cols = Dict{Symbol,Any}(:A => 1:10, :B => [:A, :B, :C, :D],
                               :C => 1:10, :D => 1:10)
    left_cols = Dict{Symbol,Any}(:E => 1:10, :F => [:A, :B, :C, :D])
    right_cols = Dict{Symbol,Any}(:G => 1:10, :H => [:A, :B, :C, :D])
    return random_join(:outer, 1000, 2000, on_cols, left_cols, right_cols)
end
# Outer join of a 10000-row and a 20000-row random table on a single key column
# :A drawn from 1:10000, with one extra column on each side (:B left, :C right).
function h()
    on_cols = Dict{Symbol,Any}(:A => collect(1:10000))
    left_cols = Dict{Symbol,Any}(:B => collect(1:10000))
    right_cols = Dict{Symbol,Any}(:C => collect(1:10000))
    return random_join(:outer, 10000, 20000, on_cols, left_cols, right_cols)
end
# groupby benchmarks. The tables are non-constant globals, so they are interpolated
# with `$`; otherwise the measurement also includes the cost of accessing an
# untyped global variable.
@benchmark groupby($small, [:A, :B])
@benchmark groupby($large, [:A, :B])
@benchmark groupby($large, [:A, :B, :C, :D, :E])
@benchmark groupby($dt1, [:v1, :v2])
@benchmark groupby($dt2, [:v1, :v2])
@benchmark groupby($dt3, [:v1, :v2, :v1_1, :v2_1])

# Join benchmarks. Each evaluation generates fresh random tables inside f()/h(),
# so the timings cover table construction as well as the join itself.
@benchmark f()
@benchmark h()
using DataFrames
using BenchmarkTools
using Distributions
# Fixed seed so the randomly generated benchmark inputs are reproducible.
srand(1);

# Small table (2000 rows). `a` pairs values drawn from [1, 2, 4] with the Boolean
# mask `rand(2000) .> .75`; `b` holds symbols, `c` strings, and `d` values drawn
# from two random floats.
a = DataArray(rand([1, 2, 4], 2000), rand(2000) .> .75)
b = rand([:a, :b], 2000);
c = rand(["a", "b"], 2000);
d = rand(rand(2), 2000);
# Column D is bound to `d`. Binding it to `c` (as before) duplicated column C and
# left `d` unused.
small = DataFrame(A = a, B = b, C = c, D = d);

# Large table. A-E each hold 889100 random floats with every value repeated five
# times in a row (inner=5); F holds 889100*5 random floats.
A = repeat(rand(889100), inner=5);
B = repeat(rand(889100), inner=5);
C = repeat(rand(889100), inner=5);
D = repeat(rand(889100), inner=5);
E = repeat(rand(889100), inner=5);
F = repeat(rand(889100*5));
large = DataFrame(A = A, B = B, C = C, D = D, E = E, F = F);

# Pooled tables: dt1 uses the values 1:10 and dt2 the values 1:100. In both,
# v1 repeats each value consecutively (inner) and v2 cycles through the values (outer).
dt1 = DataFrame(v1 = PooledDataArray(repeat(1:10, inner=1000)), v2 = PooledDataArray(repeat(1:10, outer=1000)));
dt2 = DataFrame(v1 = PooledDataArray(repeat(1:100, inner=100)), v2 = PooledDataArray(repeat(1:100, outer=100)));
# dt3 places the columns of dt1 and dt2 side by side.
dt3 = hcat(dt1, dt2);
# Build a DataFrame with `nrow` rows and one column per entry of `col_values`.
# Each column is filled by calling `sample` on the entry's value pool. A pool that
# is a PooledDataArray yields a PooledDataArray column; any other pool yields a
# DataArray column.
function random_frame(nrow::Int, col_values::Dict{Symbol, Any})
    colnames = collect(keys(col_values))
    columns = Any[]
    for name in colnames
        pool = col_values[name]
        if isa(pool, PooledDataArray)
            push!(columns, PooledDataArray(sample(pool, nrow)))
        else
            push!(columns, DataArray(sample(pool, nrow)))
        end
    end
    return DataFrame(columns, colnames)
end
# Generate two random tables and join them.
# The left table has `nrow_left` rows and the columns of `on_col_values` merged
# with `left_col_values`; the right table has `nrow_right` rows and the columns of
# `on_col_values` merged with `right_col_values`. The tables are joined on the
# keys of `on_col_values`, with `kind` forwarded to `join`.
function random_join(kind::Symbol, nrow_left::Int, nrow_right::Int,
                     on_col_values::Dict{Symbol, Any},
                     left_col_values::Dict{Symbol, Any},
                     right_col_values::Dict{Symbol, Any})
    left_spec = merge(on_col_values, left_col_values)
    left_table = random_frame(nrow_left, left_spec)
    right_spec = merge(on_col_values, right_col_values)
    right_table = random_frame(nrow_right, right_spec)
    join_cols = collect(keys(on_col_values))
    return join(left_table, right_table, on = join_cols, kind = kind)
end
# Outer join of a 1000-row and a 2000-row random table on four key columns
# (:A, :C, :D drawn from 1:10 and :B drawn from four symbols), with two extra
# columns on each side.
function f()
    on_cols = Dict{Symbol,Any}(:A => 1:10, :B => [:A, :B, :C, :D],
                               :C => 1:10, :D => 1:10)
    left_cols = Dict{Symbol,Any}(:E => 1:10, :F => [:A, :B, :C, :D])
    right_cols = Dict{Symbol,Any}(:G => 1:10, :H => [:A, :B, :C, :D])
    return random_join(:outer, 1000, 2000, on_cols, left_cols, right_cols)
end
# Outer join of a 10000-row and a 20000-row random table on a single key column
# :A drawn from 1:10000, with one extra column on each side (:B left, :C right).
function h()
    on_cols = Dict{Symbol,Any}(:A => collect(1:10000))
    left_cols = Dict{Symbol,Any}(:B => collect(1:10000))
    right_cols = Dict{Symbol,Any}(:C => collect(1:10000))
    return random_join(:outer, 10000, 20000, on_cols, left_cols, right_cols)
end
# groupby benchmarks. The tables are non-constant globals, so they are interpolated
# with `$`; otherwise the measurement also includes the cost of accessing an
# untyped global variable.
@benchmark groupby($small, [:A, :B])
@benchmark groupby($large, [:A, :B])
@benchmark groupby($large, [:A, :B, :C, :D, :E])
@benchmark groupby($dt1, [:v1, :v2])
@benchmark groupby($dt2, [:v1, :v2])
@benchmark groupby($dt3, [:v1, :v2, :v1_1, :v2_1])

# Join benchmarks. Each evaluation generates fresh random tables inside f()/h(),
# so the timings cover table construction as well as the join itself.
@benchmark f()
@benchmark h()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment