Here are versions of with and transform! for DataFrames. Each of them operates on whole columns.
function with(df::DataFrame, ex::Expr)
    # Column-wise `with`: evaluate `ex` against `df`, treating every symbol
    # whose name matches a column name of `df` as a reference to that column.
    # Returns whatever the rewritten expression evaluates to.

    # `substitute` walks the expression tree and swaps column-name symbols
    # for `df[<column position>]`. Anything that is neither an Expr nor a
    # Symbol (literals etc.) passes through unchanged.
    substitute(x, cols::Dict) = x
    function substitute(e::Expr, cols::Dict)
        # Rebuild the node with rewritten arguments; an empty argument list
        # is reused as-is rather than mapped over.
        new_args = isempty(e.args) ? e.args : map(a -> substitute(a, cols), e.args)
        Expr(e.head, new_args, e.typ)
    end
    function substitute(s::Symbol, cols::Dict)
        key = string(s)
        if !contains(keys(cols), key)
            return s    # not a column name: leave the symbol alone
        end
        # `df` here is resolved by the lambda built below, so the name must
        # match that lambda's parameter.
        :(df[$(cols[key])])
    end

    # Lookup table: column name => column position.
    col_names = tuple(colnames(df)...)
    col_positions = tuple([1:ncol(df)]...)
    col_ex = substitute(ex, dict(col_names, col_positions))

    # Wrap the rewritten expression in a one-argument function and apply it.
    f = @eval (df) -> $col_ex
    f(df)
end
# Column-wise transform on df: evaluate `ex` with `with` and store the result
# in column `colname` of `df`. The value of the assignment is returned.
function transform!(df::DataFrame, colname::String, ex::Expr)
    df[colname] = with(df, ex)
end
Here is an example:
julia> c1 = DataVec([5:8])
[5,6,7,8]
julia> c2 = DataVec([5:8] + 3)
[8,9,10,11]
julia> dv = DataVec([5:8] - 3)
[2,3,4,5]
julia> df = DataFrame({c1, c2}, ["C1", "C2"])
C1 C2
[1,] 5 8
[2,] 6 9
[3,] 7 10
[4,] 8 11
julia> a = with(df, :( C1 + C2 - 1 ))
[12,14,16,18]
julia> transform!(df, "Cnew", :( C1 + C2 - 1 ))
[12,14,16,18]
julia> df
C1 C2 Cnew
[1,] 5 8 12
[2,] 6 9 14
[3,] 7 10 16
[4,] 8 11 18
Here are similar functions that operate element by element. With these, there is much less copying.
function withe(df::DataFrame, ex::Expr)
    # Element-by-element `with`: evaluate `ex` once per row of `df`, treating
    # every symbol whose name matches a column name as the value of that
    # column in the current row. The per-row results are returned in a DataVec.

    # `substitute` walks the expression tree and swaps column-name symbols
    # for `df[rowidx, <column position>]`. Anything that is neither an Expr
    # nor a Symbol (literals etc.) passes through unchanged.
    substitute(x, cols::Dict) = x
    function substitute(e::Expr, cols::Dict)
        # Rebuild the node with rewritten arguments; an empty argument list
        # is reused as-is rather than mapped over.
        new_args = isempty(e.args) ? e.args : map(a -> substitute(a, cols), e.args)
        Expr(e.head, new_args, e.typ)
    end
    function substitute(s::Symbol, cols::Dict)
        key = string(s)
        if !contains(keys(cols), key)
            return s    # not a column name: leave the symbol alone
        end
        # `df` and `rowidx` are resolved by the lambda built below, so the
        # names must match that lambda's parameters.
        :(df[rowidx, $(cols[key])])
    end

    # Lookup table: column name => column position.
    col_names = tuple(colnames(df)...)
    col_positions = tuple([1:ncol(df)]...)
    row_ex = substitute(ex, dict(col_names, col_positions))

    # Wrap the rewritten expression in a (frame, row index) function.
    f = @eval (df, rowidx) -> $row_ex

    # The result of the first row is used to fill (and thereby type) the
    # output; every row, including the first, is then evaluated in the loop.
    seed = f(df, 1)
    res = DataVec(fill(seed, nrow(df)))
    for i = 1:nrow(df)
        res[i] = f(df, i)
    end
    res
end
# Element-by-element transform on df: evaluate `ex` row by row with `withe`
# and store the result in column `colname` of `df`. The value of the
# assignment is returned.
# Note: this must call `withe`, not `with`; calling `with` would make this
# function identical to the column-wise `transform!`.
transforme!(df::DataFrame, colname::String, ex::Expr) =
    df[colname] = withe(df, ex)
Here is an example:
julia> with(df, :( C1 + C2 - 1 ))
[12,14,16,18]
julia> withe(df, :( C1 + C2 - 1 ))
[12,14,16,18]
With reduction operators, the two versions give different results, because in the element-by-element version sum(C2) only sees the value of C2 in the current row:
julia> with(df, :( C1 + sum(C2) - 1 ))
[42,43,44,45]
julia> withe(df, :( C1 + sum(C2) - 1 ))
[12,14,16,18]
Also, element-by-element operation is currently broken for NAs, because scalar operations involving NA (for example, 1 + NA) are not supported yet.