Skip to content

Instantly share code, notes, and snippets.

@tshort
Created June 19, 2012 15:44
Show Gist options
  • Save tshort/2954892 to your computer and use it in GitHub Desktop.
Save tshort/2954892 to your computer and use it in GitHub Desktop.
with/transform for DataFrames

Here are versions of with and transform! for DataFrames. Each operates on whole columns.

# Column-wise `with` for a DataFrame.
#
# Each symbol in `ex` whose name matches a column name of `df` is rewritten
# to an index into `df` by column position (`df[pos]`); every other symbol
# and every non-symbol leaf is left unchanged. The rewritten expression is
# used as the body of an anonymous function of `df` (built with @eval),
# which is then called on `df`. The value of that call is returned.
function with(df::DataFrame, ex::Expr)
    # Anything that is neither a Symbol nor an Expr passes through untouched.
    function subst(leaf, lookup::Dict)
        leaf
    end

    # A symbol that names a column becomes `df[<column position>]`;
    # any other symbol is kept as it is.
    function subst(sym::Symbol, lookup::Dict)
        name = string(sym)
        if !contains(keys(lookup), name)
            return sym
        end
        :(df[$(lookup[name])])
    end

    # An expression is rebuilt with the same head and typ, rewriting each
    # of its arguments recursively (an empty argument list is reused as is).
    function subst(node::Expr, lookup::Dict)
        if isempty(node.args)
            newargs = node.args
        else
            newargs = map(a -> subst(a, lookup), node.args)
        end
        Expr(node.head, newargs, node.typ)
    end

    # Map each column name to its column position.
    colkeys = tuple(colnames(df)...)
    colidx = tuple([1:ncol(df)]...)
    colpos = dict(colkeys, colidx)

    rewritten = subst(ex, colpos)
    # The parameter must be called `df`: the rewritten expression refers to it.
    colfn = @eval (df) -> $rewritten
    colfn(df)
end

# Column-wise transform on df: evaluate `ex` against `df` using `with` and
# store the result in `df` under `colname`. The value of the assignment
# (the computed column) is returned.
function transform!(df::DataFrame, colname::String, ex::Expr)
    newcol = with(df, ex)
    df[colname] = newcol
end

Here is an example:

julia> c1 = DataVec([5:8])
[5,6,7,8]

julia> c2 = DataVec([5:8] + 3)
[8,9,10,11]

julia> dv = DataVec([5:8] - 3)
[2,3,4,5]

julia> df = DataFrame({c1, c2}, ["C1", "C2"])
      C1 C2
[1,]   5  8
[2,]   6  9
[3,]   7 10
[4,]   8 11

julia> a = with(df, :( C1 + C2 - 1 ))
[12,14,16,18]

julia> transform!(df, "Cnew", :( C1 + C2 - 1 ))
[12,14,16,18]

julia> df
      C1 C2 Cnew
[1,]   5  8   12
[2,]   6  9   14
[3,]   7 10   16
[4,]   8 11   18

Here are similar functions that operate element by element. With these, there is much less copying.

# Element-by-element `with` for a DataFrame.
#
# Each symbol in `ex` whose name matches a column name of `df` is rewritten
# to a single-cell reference `df[rowidx, pos]`; every other symbol and every
# non-symbol leaf is left unchanged. The rewritten expression is used as the
# body of an anonymous function of `(df, rowidx)` (built with @eval), which
# is called once per row. The per-row results are returned in a DataVec.
function withe(df::DataFrame, ex::Expr)
    # Anything that is neither a Symbol nor an Expr passes through untouched.
    function subst(leaf, lookup::Dict)
        leaf
    end

    # A symbol that names a column becomes `df[rowidx, <column position>]`;
    # any other symbol is kept as it is.
    function subst(sym::Symbol, lookup::Dict)
        name = string(sym)
        if !contains(keys(lookup), name)
            return sym
        end
        :(df[rowidx, $(lookup[name])])
    end

    # An expression is rebuilt with the same head and typ, rewriting each
    # of its arguments recursively (an empty argument list is reused as is).
    function subst(node::Expr, lookup::Dict)
        if isempty(node.args)
            newargs = node.args
        else
            newargs = map(a -> subst(a, lookup), node.args)
        end
        Expr(node.head, newargs, node.typ)
    end

    # Map each column name to its column position.
    colkeys = tuple(colnames(df)...)
    colidx = tuple([1:ncol(df)]...)
    colpos = dict(colkeys, colidx)

    rewritten = subst(ex, colpos)
    # The parameters must be called `df` and `rowidx`: the rewritten
    # expression refers to both names.
    rowfn = @eval (df, rowidx) -> $rewritten

    # The value for row 1 seeds the result vector, which fixes its element
    # type; every row (including row 1) is then computed in the loop.
    seed = rowfn(df, 1)
    out = DataVec(fill(seed, nrow(df)))
    for i = 1:nrow(df)
        out[i] = rowfn(df, i)
    end
    out
end

# element-by-element transform on df
#
# Evaluates `ex` against `df` one row at a time using `withe` and stores the
# result in `df` under `colname`. The value of the assignment (the computed
# column) is returned.
#
# This must call `withe`, not the column-wise `with`; otherwise it would be
# indistinguishable from `transform!`.
transforme!(df::DataFrame, colname::String, ex::Expr) =
    df[colname] = withe(df, ex)

Here is an example:

julia> with(df, :( C1 + C2 - 1 ))         
[12,14,16,18]

julia> withe(df, :( C1 + C2 - 1 ))
[12,14,16,18]

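With reduction operators, the column-wise and element-by-element versions give different results, because with applies the reduction to the whole column while withe applies it to a single element at a time: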

julia> with(df, :( C1 + sum(C2) - 1 ))    
[42,43,44,45]

julia> withe(df, :( C1 + sum(C2) - 1 )) 
[12,14,16,18]

Also, element-by-element operation is broken for NAs, because element-by-element operations with NAs are not yet supported (example: 1 + NA).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment