Created
June 23, 2020 23:44
-
-
Save quinnj/eb25a3f7493aab6cb450707c83f0b170 to your computer and use it in GitHub Desktop.
CSV.jl inference issue w/ generated function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the CSV.jl branch under investigation. This is the scriptable equivalent of
# the Pkg REPL command `] add CSV#jq/lazystrings`, which is not valid syntax in a file.
using Pkg
Pkg.add(Pkg.PackageSpec(name="CSV", rev="jq/lazystrings"))
using CSV

# Input file: `randoms.csv`, resolved relative to the directory containing CSV's
# main source file (i.e. one level above it).
source = joinpath(dirname(pathof(CSV)), "../randoms.csv")

# Values passed positionally to `CSV.Header` below, in this order.
header = 1
normalizenames = false
datarow = -1
skipto = nothing
footerskip = 0
limit = typemax(Int64)
transpose = false
comment = nothing
use_mmap = true
ignoreemptylines = false
threaded = false
select = [:id]
drop = nothing
missingstrings = String[]
missingstring = ""
delim = nothing
ignorerepeated = false
quotechar = '"'
openquotechar = nothing
closequotechar = nothing
escapechar = '"'
dateformat = nothing
dateformats = nothing
decimal = UInt8('.')
truestrings = ["true", "True", "TRUE"]
falsestrings = ["false", "False", "FALSE"]
type = nothing
types = Dict(1 => Int32)  # request a non-default element type (Int32) for column 1
typemap = Dict{Type,Type}()
categorical = false
pool = false
lazystrings = false
strict = false
silencewarnings = false
debug = false
parsingdebug = false

# Build the header object directly through the internal constructor.
h = CSV.Header(
    source, header, normalizenames, datarow, skipto, footerskip, limit, transpose,
    comment, use_mmap, ignoreemptylines, threaded, select, drop, missingstrings,
    missingstring, delim, ignorerepeated, quotechar, openquotechar, closequotechar,
    escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type,
    types, typemap, categorical, pool, lazystrings, strict, silencewarnings, debug,
    parsingdebug, false,
)

# Pull the fields needed by `CSV.parsetape!` out of the header.
# NOTE: `types`, `pool` and `categorical` are rebound here, replacing the input
# values assigned above with the ones stored on the header.
rowsguess, ncols, buf, len, datapos, options, coloptions, positions, types, flags, pool,
categorical, customtypes =
    h.rowsguess, h.cols, h.buf, h.len, h.datapos, h.options, h.coloptions, h.positions,
    h.types, h.flags, h.pool, h.categorical, h.customtypes

# Output storage handed to `CSV.parsetape!`.
refs = Vector{CSV.RefPool}(undef, ncols)
tapes = CSV.allocate(rowsguess, ncols, types, flags)
# Here's where we parse the whole file, and I currently see 3.43M allocations.
# The identical call is timed twice (presumably so that the second measurement is
# not dominated by first-call compilation).
@time finalrows, pos = CSV.parsetape!(
    Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool,
    refs, rowsguess, types, flags, debug, options, coloptions, customtypes,
)
@time finalrows, pos = CSV.parsetape!(
    Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool,
    refs, rowsguess, types, flags, debug, options, coloptions, customtypes,
)
# julia> @time finalrows, pos = CSV.parsetape!(Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool, refs, rowsguess, types, flags, debug, options, coloptions, customtypes) | |
# customtypes = Tuple{Tuple{SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}},Int32}} | |
# 0.091579 seconds (3.43 M allocations: 85.382 MiB, 8.28% gc time) | |
# (70000, 4545084) | |
# `customtypes` is a Tuple type of 2-Tuple types, like Tuple{Tuple{ArrayType, ElementType}...}.
# It's used in `parsecustom!` to generate inline `parsevalue!` calls for each
# ArrayType=>ElementType pair (there will be one for each non-standard type a user
# requests in parsing).
# But code_typed looks great! The call to `parsecustom!` was generated correctly, it
# looks like it inlined, and everything is dandy.
@code_typed debuginfo=:source CSV.parsetape!(
    Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool,
    refs, rowsguess, types, flags, debug, options, coloptions, customtypes,
)
# for reference, we can compare the code_typed output for a hard-coded Time column | |
# │ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:591 within `parserow' | |
# 45 ──│ %127 = π (%49, SentinelArrays.SentinelArray{Dates.Time,1,Dates.Time,Missing,Array{Dates.Time,1}}) | |
# │ │ %128 = invoke CSV.parsevalue!(CSV.Time::Type{Dates.Time}, %48::UInt8, %127::SentinelArrays.SentinelArray{Dates.Time,1,Dates.Time,Missing,Array{Dates.Time,1}}, _5::Array{AbstractArray{T,1} where T,1}, _6::Array{UInt8,1}, %45::Int64, _8::Int64, _17::Parsers.Options{false,false,true,false,Missing,UInt8,Nothing}, %31::Int64, %43::Int64, _14::Array{Type,1}, _15::Array{UInt8,1})::Tuple{Int64,Int16} | |
# with our custom type generated output | |
# 52 ──││││ %153 = Base.arrayref(false, tapes, %43)::AbstractArray{T,1} where T | |
# │ │││└ | |
# │ │││ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:549 within `macro expansion' | |
# │ │││ %154 = (%153 isa SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}})::Bool | |
# └────│││ goto #54 if not %154 | |
# │││ @ /Users/jacobquinn/.julia/dev/CSV/src/file.jl:550 within `macro expansion' | |
# 53 ──│││ %156 = π (%153, SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}}) | |
# │ │││ %157 = invoke CSV.parsevalue!(Int32::Type{Int32}, %48::UInt8, %156::SentinelArrays.SentinelArray{Int32,1,Int32,Missing,Array{Int32,1}}, _5::Array{AbstractArray{T,1} where T,1}, _6::Array{UInt8,1}, %45::Int64, _8::Int64, _17::Parsers.Options{false,false,true,false,Missing,UInt8,Nothing}, %31::Int64, %43::Int64, _14::Array{Type,1}, _15::Array{UInt8,1})::Tuple{Int64,Int16} | |
# But code_llvm is sad: there are several `jl_box_int64` calls that seem to account
# for all the allocations.
@code_llvm debuginfo=:source CSV.parsetape!(
    Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool,
    refs, rowsguess, types, flags, debug, options, coloptions, customtypes,
)

using Profile

# Profile the same call twice; each run starts from a freshly collected heap and a
# cleared profile buffer, so only the second run's samples remain for printing.
GC.gc()
GC.gc()
Profile.clear()
@profile CSV.parsetape!(
    Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool,
    refs, rowsguess, types, flags, debug, options, coloptions, customtypes,
)
GC.gc()
GC.gc()
Profile.clear()
@profile CSV.parsetape!(
    Val(transpose), ncols, typemap, tapes, buf, datapos, len, limit, positions, pool,
    refs, rowsguess, types, flags, debug, options, coloptions, customtypes,
)

# Reveals lots of allocations/boxing (C=true includes C frames in the report).
Profile.print(C=true)
# From what I can tell, inference is unable to treat the generated `CSV.parsevalue!(Int32::Type{Int32}, ...)` call the same way
# as the calls for our other hard-coded column types. I don't know whether that's a constant-propagation issue, a dataflow-analysis issue, or something else. But even if you _don't_ call with
# a custom type, it still leads to the spike in allocations, because the `row`, `pos`, and possibly `code` variables all seem to get boxed through
# the various layers of `parsetape!`, `parserow`, and `parsevalue!`.
# I did try avoiding the generated function and instead using a macro with an unrolled `Base.@nexprs`, but that didn't seem to change anything.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment