Last active
October 22, 2016 03:02
-
-
Save kleinschmidt/0125c14dd6d5b285e7bb82031e602ea1 to your computer and use it in GitHub Desktop.
benchmarking DataFrames#1081
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
- # Formulas for representing and working with linear-model-type expressions | |
- # Original by Harlan D. Harris. Later modifications by John Myles White | |
- # and Douglas M. Bates. | |
- | |
- ## Formulas are written as expressions and parsed by the Julia parser. | |
- ## For example :(y ~ a + b + log(c)) | |
- ## In Julia the & operator is used for an interaction. What would be written | |
- ## in R as y ~ a + b + a:b is written :(y ~ a + b + a&b) in Julia. | |
- ## The equivalent R expression, y ~ a*b, is the same in Julia | |
- | |
- ## The lhs of a one-sided formula is 'nothing' | |
- ## The rhs of a formula can be 1 | |
- | |
- type Formula | |
- lhs::@compat(Union{Symbol, Expr, Void}) | |
- rhs::@compat(Union{Symbol, Expr, Integer}) | |
- end | |
- | |
- macro ~(lhs, rhs) | |
- ex = Expr(:call, | |
- :Formula, | |
- Base.Meta.quot(lhs), | |
- Base.Meta.quot(rhs)) | |
- return ex | |
- end | |
- | |
- # | |
192000 # TODO: implement contrast types in Formula/Terms | |
- # | |
- type Terms | |
- terms::Vector | |
- eterms::Vector # evaluation terms | |
- factors::Matrix{Bool} # maps terms to evaluation terms | |
- ## An eterms x terms matrix which is true for terms that need to be "promoted" | |
- ## to full rank in constructing a model matrx | |
- is_non_redundant::Matrix{Bool} | |
- # order can probably be dropped. It is vec(sum(factors, 1)) | |
- order::Vector{Int} # orders of rhs terms | |
- response::Bool # indicator of a response, which is eterms[1] if present | |
- intercept::Bool # is there an intercept column in the model matrix? | |
- end | |
- | |
- type ModelFrame | |
- df::AbstractDataFrame | |
- terms::Terms | |
- msng::BitArray | |
- ## mapping from df keys to contrasts matrices | |
- contrasts::Dict{Symbol, ContrastsMatrix} | |
- end | |
- | |
- typealias AbstractFloatMatrix{T<:AbstractFloat} AbstractMatrix{T} | |
- | |
- type ModelMatrix{T <: AbstractFloatMatrix} | |
- m::T | |
- assign::Vector{Int} | |
- end | |
- | |
- Base.size(mm::ModelMatrix) = size(mm.m) | |
- Base.size(mm::ModelMatrix, dim...) = size(mm.m, dim...) | |
- | |
- function Base.show(io::IO, f::Formula) | |
- print(io, | |
- string("Formula: ", | |
- f.lhs == nothing ? "" : f.lhs, " ~ ", f.rhs)) | |
576000 end | |
- | |
- ## Return, as a vector of symbols, the names of all the variables in | |
- ## an expression or a formula | |
- function allvars(ex::Expr) | |
- if ex.head != :call error("Non-call expression encountered") end | |
- cc=Symbol[] | |
- for i in ex.args[2:end] cc=append!(cc,allvars(i)) end | |
- cc | |
- end | |
- allvars(f::Formula) = unique(vcat(allvars(f.rhs), allvars(f.lhs))) | |
- allvars(sym::Symbol) = [sym] | |
- allvars(v::Any) = Array(Symbol, 0) | |
- | |
- # special operators in formulas | |
- const specials = Set([:+, :-, :*, :/, :&, :|, :^]) | |
448000 | |
- function dospecials(ex::Expr) | |
0 if ex.head != :call error("Non-call expression encountered") end | |
0 a1 = ex.args[1] | |
0 if !(a1 in specials) return ex end | |
0 excp = copy(ex) | |
0 excp.args = vcat(a1,map(dospecials, ex.args[2:end])) | |
0 if a1 != :* return excp end | |
0 aa = excp.args | |
0 a2 = aa[2] | |
0 a3 = aa[3] | |
0 if length(aa) > 3 | |
0 excp.args = vcat(a1, aa[3:end]) | |
0 a3 = dospecials(excp) | |
- end | |
- ## this order of expansion gives the R-style ordering of interaction | |
- ## terms (after sorting in increasing interaction order) for higher- | |
- ## order interaction terms (e.g. x1 * x2 * x3 should expand to x1 + | |
0 ## x2 + x3 + x1&x2 + x1&x3 + x2&x3 + x1&x2&x3) | |
0 :($a2 + $a2 & $a3 + $a3) | |
- end | |
- dospecials(a::Any) = a | |
- | |
- ## Distribution of & over + | |
- const distributive = @compat Dict(:& => :+) | |
- | |
- distribute(ex::Expr) = distribute!(copy(ex)) | |
- distribute(a::Any) = a | |
- ## apply distributive property in-place | |
- function distribute!(ex::Expr) | |
0 if ex.head != :call error("Non-call expression encountered") end | |
0 [distribute!(a) for a in ex.args[2:end]] | |
- ## check that top-level can be distributed | |
0 a1 = ex.args[1] | |
0 if a1 in keys(distributive) | |
- | |
- ## which op is being DISTRIBUTED (e.g. &, *)? | |
0 distributed_op = a1 | |
- ## which op is doing the distributing (e.g. +)? | |
0 distributing_op = distributive[a1] | |
- | |
- ## detect distributing sub-expression (first arg is, e.g. :+) | |
0 is_distributing_subex(e) = | |
- typeof(e)==Expr && e.head == :call && e.args[1] == distributing_op | |
- | |
- ## find first distributing subex | |
0 first_distributing_subex = findfirst(is_distributing_subex, ex.args) | |
0 if first_distributing_subex != 0 | |
- ## remove distributing subexpression from args | |
0 subex = splice!(ex.args, first_distributing_subex) | |
- | |
0 newargs = Any[distributing_op] | |
- ## generate one new sub-expression, which calls the distributed operation | |
0 ## (e.g. &) on each of the distributing sub-expression's arguments, plus | |
- ## the non-distributed arguments of the original expression. | |
0 for a in subex.args[2:end] | |
0 new_subex = copy(ex) | |
0 push!(new_subex.args, a) | |
- ## need to recurse here, in case there are any other | |
- ## distributing operations in the sub expression | |
0 distribute!(new_subex) | |
0 push!(newargs, new_subex) | |
- end | |
0 ex.args = newargs | |
- end | |
- end | |
0 ex | |
- end | |
- distribute!(a::Any) = a | |
- | |
- const associative = Set([:+,:*,:&]) # associative special operators | |
- | |
- ## If the expression is a call to the function s return its arguments | |
- ## Otherwise return the expression | |
- function ex_or_args(ex::Expr,s::Symbol) | |
- if ex.head != :call error("Non-call expression encountered") end | |
- if ex.args[1] == s | |
- ## recurse in case there are more :calls of s below | |
- return vcat(map(x -> ex_or_args(x, s), ex.args[2:end])...) | |
- else | |
- ## not a :call to s, return condensed version of ex | |
- return condense(ex) | |
- end | |
- end | |
- ex_or_args(a,s::Symbol) = a | |
- | |
- ## Condense calls like :(+(a,+(b,c))) to :(+(a,b,c)) | |
- function condense(ex::Expr) | |
0 if ex.head != :call error("Non-call expression encountered") end | |
0 a1 = ex.args[1] | |
0 if !(a1 in associative) return ex end | |
0 excp = copy(ex) | |
0 excp.args = vcat(a1, map(x->ex_or_args(x,a1), ex.args[2:end])...) | |
0 excp | |
- end | |
- condense(a::Any) = a | |
- | |
- ## always return an ARRAY of terms | |
- getterms(ex::Expr) = (ex.head == :call && ex.args[1] == :+) ? ex.args[2:end] : Expr[ex] | |
- getterms(a::Any) = Any[a] | |
- | |
- ord(ex::Expr) = (ex.head == :call && ex.args[1] == :&) ? length(ex.args)-1 : 1 | |
- ord(a::Any) = 1 | |
- | |
- const nonevaluation = Set([:&,:|]) # operators constructed from other evaluations | |
- ## evaluation terms - the (filtered) arguments for :& and :|, otherwise the term itself | |
- function evt(ex::Expr) | |
- if ex.head != :call error("Non-call expression encountered") end | |
- if !(ex.args[1] in nonevaluation) return ex end | |
- filter(x->!isa(x,Number), vcat(map(getterms, ex.args[2:end])...)) | |
- end | |
- evt(a) = Any[a] | |
- | |
- function Terms(f::Formula) | |
- rhs = condense(distribute(dospecials(f.rhs))) | |
0 tt = unique(getterms(rhs)) | |
0 tt = tt[!(tt .== 1)] # drop any explicit 1's | |
0 noint = (tt .== 0) | (tt .== -1) # should also handle :(-(expr,1)) | |
0 tt = tt[!noint] | |
0 oo = Int[ord(t) for t in tt] # orders of interaction terms | |
0 if !issorted(oo) # sort terms by increasing order | |
0 pp = sortperm(oo) | |
0 tt = tt[pp] | |
0 oo = oo[pp] | |
- end | |
0 etrms = map(evt, tt) | |
0 haslhs = f.lhs != nothing | |
0 if haslhs | |
0 unshift!(etrms, Any[f.lhs]) | |
0 unshift!(oo, 1) | |
- end | |
0 ev = unique(vcat(etrms...)) | |
0 sets = [Set(x) for x in etrms] | |
0 facs = Bool[t in s for t in ev, s in sets] | |
0 non_redundants = fill(false, size(facs)) # initialize to false | |
0 Terms(tt, ev, facs, non_redundants, oo, haslhs, !any(noint)) | |
- end | |
- | |
- ## Default NULL handler. Others can be added as keyword arguments | |
- function null_omit(df::DataFrame) | |
- cc = complete_cases(df) | |
0 sub(df, cc), cc | |
- end | |
- | |
- _droplevels!(x::Any) = x | |
- _droplevels!(x::Union{CategoricalArray, NullableCategoricalArray}) = droplevels!(x) | |
- | |
- is_categorical(::Union{CategoricalArray, NullableCategoricalArray}) = true | |
- is_categorical(::Any) = false | |
- | |
- ## Check for non-redundancy of columns. For instance, if x is a factor with two | |
- ## levels, it should be expanded into two columns in y~0+x but only one column | |
- ## in y~1+x. The default is the rank-reduced form (contrasts for n levels only | |
- ## produce n-1 columns). In general, an evaluation term x within a term | |
- ## x&y&... needs to be "promoted" to full rank if y&... hasn't already been | |
- ## included (either explicitly in the Terms or implicitly by promoting another | |
- ## term like z in z&y&...). | |
- ## | |
- ## This modifies the Terms, setting `trms.is_non_redundant = true` for all non- | |
- ## redundant evaluation terms. | |
- function check_non_redundancy!(trms::Terms, df::AbstractDataFrame) | |
- | |
- (n_eterms, n_terms) = size(trms.factors) | |
- | |
0 encountered_columns = Vector{eltype(trms.factors)}[] | |
- | |
0 if trms.intercept | |
0 push!(encountered_columns, zeros(eltype(trms.factors), n_eterms)) | |
- end | |
- | |
0 for i_term in 1:n_terms | |
0 for i_eterm in 1:n_eterms | |
- ## only need to check eterms that are included and can be promoted | |
- ## (e.g., categorical variables that expand to multiple mm columns) | |
0 if Bool(trms.factors[i_eterm, i_term]) && is_categorical(df[trms.eterms[i_eterm]]) | |
0 dropped = trms.factors[:,i_term] | |
0 dropped[i_eterm] = 0 | |
- | |
0 if findfirst(encountered_columns, dropped) == 0 | |
0 trms.is_non_redundant[i_eterm, i_term] = true | |
0 push!(encountered_columns, dropped) | |
- end | |
- | |
- end | |
- end | |
- ## once we've checked all the eterms in this term, add it to the list | |
- ## of encountered terms/columns | |
0 push!(encountered_columns, Compat.view(trms.factors, :, i_term)) | |
- end | |
- | |
0 return trms.is_non_redundant | |
- end | |
- | |
- | |
- const DEFAULT_CONTRASTS = DummyCoding | |
- | |
- ## Set up contrasts: | |
- ## Combine actual DF columns and contrast types if necessary to compute the | |
- ## actual contrasts matrices, levels, and term names (using DummyCoding | |
- ## as the default) | |
- function evalcontrasts(df::AbstractDataFrame, contrasts::Dict = Dict()) | |
- evaledContrasts = Dict() | |
0 for (term, col) in eachcol(df) | |
0 is_categorical(col) || continue | |
0 evaledContrasts[term] = ContrastsMatrix(haskey(contrasts, term) ? | |
- contrasts[term] : | |
- DEFAULT_CONTRASTS(), | |
- col) | |
- end | |
0 return evaledContrasts | |
- end | |
- | |
- function ModelFrame(trms::Terms, d::AbstractDataFrame; | |
- contrasts::Dict = Dict()) | |
- df, msng = null_omit(DataFrame(map(x -> d[x], trms.eterms))) | |
0 names!(df, convert(Vector{Symbol}, map(string, trms.eterms))) | |
0 for c in eachcol(df) _droplevels!(c[2]) end | |
- | |
0 evaledContrasts = evalcontrasts(df, contrasts) | |
- | |
- ## Check for non-redundant terms, modifying terms in place | |
0 check_non_redundancy!(trms, df) | |
- | |
0 ModelFrame(df, trms, msng, evaledContrasts) | |
- end | |
- | |
- ModelFrame(df::AbstractDataFrame, term::Terms, msng::BitArray) = ModelFrame(df, term, msng, evalcontrasts(df)) | |
- ModelFrame(f::Formula, d::AbstractDataFrame; kwargs...) = ModelFrame(Terms(f), d; kwargs...) | |
- ModelFrame(ex::Expr, d::AbstractDataFrame; kwargs...) = ModelFrame(Formula(ex), d; kwargs...) | |
- | |
- ## modify contrasts in place | |
- function setcontrasts!(mf::ModelFrame, new_contrasts::Dict) | |
- new_contrasts = Dict([ Pair(col, ContrastsMatrix(contr, mf.df[col])) | |
- for (col, contr) in filter((k,v)->haskey(mf.df, k), new_contrasts) ]) | |
- | |
- mf.contrasts = merge(mf.contrasts, new_contrasts) | |
- return mf | |
- end | |
- setcontrasts!(mf::ModelFrame; kwargs...) = setcontrasts!(mf, Dict(kwargs)) | |
- | |
- """ | |
- StatsBase.model_response(mf::ModelFrame) | |
- Extract the response column, if present. `DataVector` or | |
- `PooledDataVector` columns are converted to `Array`s | |
- """ | |
- function StatsBase.model_response(mf::ModelFrame) | |
- if mf.terms.response | |
- convert(Array, mf.df[mf.terms.eterms[1]]) | |
- else | |
- error("Model formula one-sided") | |
- end | |
- end | |
- | |
- function nc(name::Symbol, mf::ModelFrame; non_redundant::Bool=false) | |
- if haskey(mf.contrasts, name) | |
0 size(non_redundant ? | |
- ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) : | |
- mf.contrasts[name], | |
- 2) | |
- else | |
0 1 | |
- end | |
- end | |
992000 | |
- ## construct model matrix columns from model frame + name (checks for contrasts) | |
- function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, name::Symbol, mf::ModelFrame; non_redundant::Bool = false) | |
- if haskey(mf.contrasts, name) | |
0 modelmat_cols(T, mf.df[name], | |
- non_redundant ? | |
- ContrastsMatrix{FullDummyCoding}(mf.contrasts[name]) : | |
- mf.contrasts[name]) | |
- else | |
0 modelmat_cols(T, mf.df[name]) | |
- end | |
- end | |
- | |
- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::AbstractVector) = | |
- convert(T, reshape(v, length(v), 1)) | |
- # FIXME: this inefficient method should not be needed, cf. JuliaLang/julia#18264 | |
- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::NullableVector) = | |
- convert(T, Matrix(reshape(v, length(v), 1))) | |
- | |
- """ | |
- modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, v::PooledDataVector, contrast::ContrastsMatrix) | |
- | |
16000 Construct `ModelMatrix` columns of type `T` based on specified contrasts, ensuring that | |
- levels align properly. | |
- """ | |
- function modelmat_cols{T<:AbstractFloatMatrix}(::Type{T}, | |
- v::Union{CategoricalVector, NullableCategoricalVector}, | |
- contrast::ContrastsMatrix) | |
- ## make sure the levels of the contrast matrix and the categorical data | |
- ## are the same by constructing a re-indexing vector. Indexing into | |
- ## reindex with v.refs will give the corresponding row number of the | |
- ## contrast matrix | |
- reindex = [findfirst(contrast.levels, l) for l in levels(v)] | |
0 contrastmatrix = convert(T, contrast.matrix) | |
0 return indexrows(contrastmatrix, reindex[v.refs]) | |
- end | |
- | |
- indexrows(m::SparseMatrixCSC, ind::Vector{Int}) = m'[:, ind]' | |
- indexrows(m::AbstractMatrix, ind::Vector{Int}) = m[ind, :] | |
- | |
- """ | |
- expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) | |
- Create pairwise products of columns from a vector of matrices | |
- """ | |
- function expandcols{T<:AbstractFloatMatrix}(trm::Vector{T}) | |
- if length(trm) == 1 | |
0 trm[1] | |
- else | |
0 a = trm[1] | |
0 b = expandcols(trm[2 : end]) | |
0 reduce(hcat, [broadcast(*, a, Compat.view(b, :, j)) for j in 1 : size(b, 2)]) | |
0 end | |
0 end | |
- | |
- """ | |
- droprandomeffects(trms::Terms) | |
- Expressions of the form `(a|b)` are "random-effects" terms and are not | |
- incorporated in the ModelMatrix. This function checks for such terms and, | |
- if any are present, drops them from the `Terms` object. | |
- """ | |
- function droprandomeffects(trms::Terms) | |
- retrms = Bool[Meta.isexpr(t, :call) && t.args[1] == :| for t in trms.terms] | |
128000 if !any(retrms) # return trms unchanged | |
0 trms | |
0 elseif all(retrms) && !trms.response # return an empty Terms object | |
0 Terms(Any[],Any[],Array(Bool, (0,0)),Array(Bool, (0,0)), Int[], false, trms.intercept) | |
- else | |
- # the rows of `trms.factors` correspond to `eterms`, the columns to `terms` | |
- # After dropping random-effects terms we drop any eterms whose rows are all false | |
0 ckeep = !retrms # columns to retain | |
0 facs = trms.factors[:, ckeep] | |
0 rkeep = vec(sum(facs, 2) .> 0) | |
0 Terms(trms.terms[ckeep], trms.eterms[rkeep], facs[rkeep, :], | |
- trms.is_non_redundant[rkeep, ckeep], | |
- trms.order[ckeep], trms.response, trms.intercept) | |
- end | |
- end | |
- | |
- """ | |
- dropresponse!(trms::Terms) | |
- Drop the response term, `trms.eterms[1]` and the first row and column | |
- of `trms.factors` if `trms.response` is true. | |
- """ | |
- function dropresponse!(trms::Terms) | |
- if trms.response | |
0 ckeep = 2:size(trms.factors, 2) | |
0 rkeep = vec(any(trms.factors[:, ckeep], 2)) | |
0 Terms(trms.terms, trms.eterms[rkeep], trms.factors[rkeep, ckeep], | |
- trms.is_non_redundant[rkeep, ckeep], trms.order[ckeep], false, trms.intercept) | |
- else | |
0 trms | |
- end | |
- end | |
- | |
- """ | |
- ModelMatrix{T<:AbstractFloatMatrix}(mf::ModelFrame) | |
- Create a `ModelMatrix` of type `T` (default `Matrix{Float64}`) from the | |
- `terms` and `df` members of `mf`. | |
- | |
- This is basically a map-reduce where terms are mapped to columns by `cols` | |
- and reduced by `hcat`. During the collection of the columns the `assign` | |
- vector is created. `assign` maps columns of the model matrix to terms in | |
- the model frame. It can also be considered as mapping coefficients to | |
- terms and, hence, to names. | |
- | |
- If there is an intercept in the model, that column occurs first and its | |
- `assign` value is zero. | |
- | |
- Mixed-effects models include "random-effects" terms which are ignored when | |
- creating the model matrix. | |
- """ | |
- @compat function (::Type{ModelMatrix{T}}){T<:AbstractFloatMatrix}(mf::ModelFrame) | |
- dfrm = mf.df | |
0 terms = droprandomeffects(dropresponse!(mf.terms)) | |
0 factors = terms.factors | |
- | |
- ## Step 1: | |
- ## compute which columns correspond to which terms in order to pre-allocate | |
- ## model matrix and set up views later | |
80000 term_col_inds = [] | |
0 num_cols = convert(Int, terms.intercept) | |
- | |
240000 for (i_term, term) in enumerate(terms.terms) | |
0 f = view(factors, :, i_term) | |
0 any(f) || continue | |
- | |
480000 block_sizes = map((et,nr) -> nc(et, mf, non_redundant=nr), | |
- view(terms.eterms, f), | |
- view(terms.is_non_redundant, f, i_term)) | |
0 num_term_cols = prod(block_sizes) | |
- | |
176000 push!(term_col_inds, (1:num_term_cols)+num_cols) | |
0 num_cols += num_term_cols | |
- end | |
- | |
0 if num_cols == 0 | |
0 error("Could not construct model matrix. Resulting matrix has 0 columns.") | |
0 end | |
- | |
- # Pre-allocate model matrix | |
24000128000 mm = similar(convert(T, Matrix()), size(dfrm, 1), num_cols) | |
112000 assign = zeros(Int, num_cols) | |
- | |
0 if terms.intercept | |
0 mm[:,1] = 1 | |
- end | |
- | |
- ## Step 2: | |
- ## Fill in the columns for each term. | |
- | |
- ## Map eval. term name + redundancy bool to cached model matrix columns | |
- ## (views of model matrix itself for eval terms that correspond directly to | |
- ## mm columns, and arrays otherwise) | |
0 eterm_cols = @compat Dict{Tuple{Symbol,Bool}, T}() | |
- | |
- ## turn each term into a vector of mm columns for its eval. terms, using | |
- ## "promoted" full-rank versions of categorical columns for non-redundant | |
- ## eval. terms: | |
304000 for (i_term, term) in enumerate(terms.terms) | |
- ## Pull out the eval terms, and the non-redundancy flags for this term | |
0 ff = view(factors, :, i_term) | |
384000 eterms = view(terms.eterms, ff) | |
64000 non_redundants = view(terms.is_non_redundant, ff, i_term) | |
192000 if length(eterms) == 1 && term == eterms[1] | |
- ## handle special case where term == eterm (e.g., main effect | |
- ## terms). in this case, we can store the columns generated for the | |
- ## eterm directly in the model matrix and just hold onto a view of | |
- ## those columns for any higher-order terms that need them | |
0 et = eterms[1] | |
0 nr = non_redundants[1] | |
0 mm[:, term_col_inds[i_term]] = modelmat_cols(T, et, mf, non_redundant=nr) | |
16000384000 eterm_cols[(et, nr)] = view(mm, :, term_col_inds[i_term]) | |
- else | |
- ## general case with multiple eterms and expandcols | |
0 blocks = T[] | |
0 for (et, nr) in zip(eterms, non_redundants) | |
0 block = get!(eterm_cols, (et, nr)) do | |
- modelmat_cols(T, et, mf, non_redundant=nr) | |
- end | |
0 push!(blocks, block) | |
- end | |
0 mm[:, term_col_inds[i_term]] = expandcols(blocks) | |
- end | |
0 assign[term_col_inds[i_term]] = i_term | |
- end | |
- | |
32000 ModelMatrix{T}(mm, assign) | |
- end | |
- ModelMatrix(mf::ModelFrame) = ModelMatrix{Matrix{Float64}}(mf) | |
- | |
- | |
- """ | |
- termnames(term::Symbol, col) | |
- Returns a vector of strings with the names of the coefficients | |
- associated with a term. If the column corresponding to the term | |
- is not a `CategoricalArray` or `NullableCategoricalArray`, | |
- a one-element vector is returned. | |
- """ | |
- termnames(term::Symbol, col) = [string(term)] | |
- function termnames(term::Symbol, mf::ModelFrame; non_redundant::Bool = false) | |
- if haskey(mf.contrasts, term) | |
- termnames(term, mf.df[term], | |
- non_redundant ? | |
- ContrastsMatrix{FullDummyCoding}(mf.contrasts[term]) : | |
- mf.contrasts[term]) | |
- else | |
- termnames(term, mf.df[term]) | |
- end | |
- end | |
- | |
- termnames(term::Symbol, col::Any, contrast::ContrastsMatrix) = | |
- ["$term: $name" for name in contrast.termnames] | |
- | |
- | |
- function expandtermnames(term::Vector) | |
- if length(term) == 1 | |
- return term[1] | |
- else | |
- return foldr((a,b) -> vec([string(lev1, " & ", lev2) for | |
- lev1 in a, | |
- lev2 in b]), | |
- term) | |
- end | |
- end | |
- | |
- | |
- """ | |
- coefnames(mf::ModelFrame) | |
- Returns a vector of coefficient names constructed from the Terms | |
- member and the types of the evaluation columns. | |
- """ | |
- function coefnames(mf::ModelFrame) | |
- terms = droprandomeffects(dropresponse!(mf.terms)) | |
- | |
- ## strategy mirrors ModelMatrx constructor: | |
- eterm_names = @compat Dict{Tuple{Symbol,Bool}, Vector{Compat.UTF8String}}() | |
- term_names = Vector{Compat.UTF8String}[] | |
- | |
- if terms.intercept | |
- push!(term_names, Compat.UTF8String["(Intercept)"]) | |
- end | |
- | |
- factors = terms.factors | |
- | |
- for (i_term, term) in enumerate(terms.terms) | |
- | |
- ## names for columns for eval terms | |
- names = Vector{Compat.UTF8String}[] | |
- | |
- ff = Compat.view(factors, :, i_term) | |
- eterms = Compat.view(terms.eterms, ff) | |
- non_redundants = Compat.view(terms.is_non_redundant, ff, i_term) | |
- | |
- for (et, nr) in zip(eterms, non_redundants) | |
- if !haskey(eterm_names, (et, nr)) | |
- eterm_names[(et, nr)] = termnames(et, mf, non_redundant=nr) | |
- end | |
- push!(names, eterm_names[(et, nr)]) | |
- end | |
- push!(term_names, expandtermnames(names)) | |
- end | |
- | |
- reduce(vcat, Vector{Compat.UTF8String}(), term_names) | |
- end | |
- | |
- function Formula(t::Terms) | |
- lhs = t.response ? t.eterms[1] : nothing | |
- rhs = Expr(:call,:+) | |
- if t.intercept | |
- push!(rhs.args,1) | |
- end | |
- append!(rhs.args,t.terms) | |
- Formula(lhs,rhs) | |
- end | |
- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using DataFrames | |
using BenchmarkTools | |
using JLD | |
srand(1) | |
n = 1_000_000 | |
d = DataFrame(x = rand(n), y = rand(n)) | |
suite = BenchmarkGroup() | |
suite["ModelFrame"] = BenchmarkGroup() | |
suite["ModelMatrix"] = BenchmarkGroup() | |
for rhs in [ :(x), :(x+y), :(x*y) ] | |
f = Formula(nothing, rhs) | |
suite["ModelFrame"][rhs] = @benchmarkable ModelFrame($f, $d) | |
mf = ModelFrame(f, d) | |
suite["ModelMatrix"][rhs] = @benchmarkable ModelMatrix($mf) | |
end | |
# run these lines the first time to tune the benchmarking parameters: | |
# tune!(suite) | |
# JLD.save(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite", params(suite)) | |
# then load them for future runs: | |
loadparams!(suite, | |
JLD.load(Pkg.dir("DataFrames", "benchmark", "params.jld"), "suite"), | |
:evals, :samples) | |
results = run(suite, verbose=true) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run a fixed number of time for memory allocation profiling | |
using DataFrames | |
srand(1) | |
n = 1_000_000 | |
d = DataFrame(x = rand(n), y = rand(n)) | |
f = Formula(nothing, :(x+y)) | |
mf = ModelFrame(f, d) | |
mm = ModelMatrix(mf) | |
Profile.init(delay=0.01) | |
Profile.clear() | |
Profile.clear_malloc_data() | |
@profile @time for _ in 1:1000 | |
ModelMatrix(mf) | |
end | |
# Write profile results to profile.bin. | |
open("profile.bin", "w") do f serialize(f, Profile.retrieve()) end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
julia> j = judge(median(results), median(results_master)) | |
julia> j["ModelMatrix"] | |
3-element BenchmarkTools.BenchmarkGroup: | |
tags: [] | |
:(x + y) => TrialJudgement(+21.64% => regression) | |
:(x * y) => TrialJudgement(+19.32% => regression) | |
:x => TrialJudgement(-14.57% => improvement) | |
julia> j["ModelFrame"] | |
3-element BenchmarkTools.BenchmarkGroup: | |
tags: [] | |
:(x + y) => TrialJudgement(+0.10% => invariant) | |
:(x * y) => TrialJudgement(-0.88% => invariant) | |
:x => TrialJudgement(-1.63% => invariant) | |
julia> results_master["ModelMatrix"][:x] | |
BenchmarkTools.Trial: | |
samples: 477 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
memory estimate: 22.89 mb | |
allocs estimate: 79 | |
minimum time: 8.11 ms (11.03% GC) | |
median time: 10.20 ms (31.64% GC) | |
mean time: 10.49 ms (31.88% GC) | |
maximum time: 17.91 ms (51.26% GC) | |
julia> results["ModelMatrix"][:x] | |
BenchmarkTools.Trial: | |
samples: 557 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
memory estimate: 22.89 mb | |
allocs estimate: 89 | |
minimum time: 8.00 ms (12.85% GC) | |
median time: 8.71 ms (31.05% GC) | |
mean time: 8.98 ms (31.10% GC) | |
maximum time: 13.94 ms (35.56% GC) | |
julia> results_master["ModelMatrix"][:(x+y)] | |
BenchmarkTools.Trial: | |
samples: 356 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
memory estimate: 30.52 mb | |
allocs estimate: 128 | |
minimum time: 12.37 ms (26.39% GC) | |
median time: 13.12 ms (25.51% GC) | |
mean time: 14.05 ms (24.98% GC) | |
maximum time: 224.42 ms (1.35% GC) | |
julia> results["ModelMatrix"][:(x+y)] | |
BenchmarkTools.Trial: | |
samples: 306 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
memory estimate: 38.15 mb | |
allocs estimate: 148 | |
minimum time: 12.63 ms (7.10% GC) | |
median time: 15.95 ms (25.34% GC) | |
mean time: 16.34 ms (25.49% GC) | |
maximum time: 23.80 ms (17.63% GC) | |
julia> results_master["ModelMatrix"][:(x*y)] | |
BenchmarkTools.Trial: | |
samples: 224 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
memory estimate: 45.79 mb | |
allocs estimate: 233 | |
minimum time: 18.26 ms (24.74% GC) | |
median time: 19.28 ms (24.32% GC) | |
mean time: 22.35 ms (23.24% GC) | |
maximum time: 549.82 ms (2.38% GC) | |
julia> results["ModelMatrix"][:(x*y)] | |
BenchmarkTools.Trial: | |
samples: 213 | |
evals/sample: 1 | |
time tolerance: 5.00% | |
memory tolerance: 1.00% | |
memory estimate: 53.42 mb | |
allocs estimate: 286 | |
minimum time: 21.63 ms (25.27% GC) | |
median time: 23.01 ms (25.53% GC) | |
mean time: 23.52 ms (25.68% GC) | |
maximum time: 33.33 ms (5.18% GC) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment