quinnj · April 17, 2019 22:19
diff --git a/gistfile1.txt b/gistfile1.txt
 module Foo

 # Number of bytes a delimiter occupies in the input.
 # A single-byte delimiter has length 1; a multi-byte delimiter is passed as a
 # (pointer, length) pair and its length is the second element of the pair.
 function delimlen(delim::UInt8)
    return 1
 end
 function delimlen(delim::Tuple{Ptr{UInt8}, Int})
    return last(delim)
 end
 # Test whether a delimiter starts at the current position.
 #
 # Single-byte form: `b` is the byte already loaded from the current position, so the
 # check is a plain comparison; `buf`, `pos` and `eof` are unused.
 checkdelim(delim::UInt8, b, buf, pos, eof) = delim == b

 # Multi-byte form: `delim` is a (pointer, length) pair. Returns `true` only when
 # `pos + length <= eof` and the `length` bytes of `buf` starting at `pos` are
 # byte-for-byte equal to the bytes behind the pointer.
 @inline function checkdelim(delim::Tuple{Ptr{UInt8}, Int}, b, buf, pos, eof)
    ptr, len = delim
    # the delimiter has to fit before eof, otherwise it cannot match
    pos + len <= eof || return false
    # `memcmp` is not defined in this module, so call libc directly;
    # it returns 0 when the two byte ranges are identical
    return ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), pointer(buf, pos), ptr, len) == 0
 end

 # Largest accumulator value for which `10 * x + 9` still fits in `T`:
 # (typemax(T) - 9) divided by 10, using integer division.
 function overflowval(::Type{T}) where {T <: Integer}
    headroom = typemax(T) - T(9)
    return headroom ÷ T(10)
 end

 # Parsing results are reported as a 16-bit set of flags.
 const ReturnCode = Int16

 # A code is "ok" when it is strictly positive: at least one flag is set and the
 # sign bit (INVALID) is clear.
 function ok(x::ReturnCode)
    return x > zero(ReturnCode)
 end

 const SUCCESS = 0x0000 % ReturnCode   # no flags set
 const INVALID = 0x8000 % ReturnCode   # sign bit; makes the code negative

 # success flags
 const OK                   = 0x0001 % ReturnCode   # bit 0
 const SENTINEL             = 0x0002 % ReturnCode   # bit 1

 # property flags
 const QUOTED               = 0x0004 % ReturnCode   # bit 2
 const DELIMITED            = 0x0008 % ReturnCode   # bit 3
 const NEWLINE              = 0x0010 % ReturnCode   # bit 4
 const EOF                  = 0x0020 % ReturnCode   # bit 5
 const ESCAPED_STRING       = 0x0200 % ReturnCode   # bit 9

 # invalid flags (each one also carries the INVALID sign bit)
 const INVALID_QUOTED_FIELD = 0x8040 % ReturnCode   # INVALID + bit 6
 const INVALID_DELIMITER    = 0x8080 % ReturnCode   # INVALID + bit 7
 const OVERFLOW             = 0x8100 % ReturnCode   # INVALID + bit 8

 """
     defaultparser(T, buf, pos, eof, sentinel, wh1, wh2, oq, cq, e, ignorerepeated, delim)

 Parse one integer field of type `T` from the bytes of `buf`, starting at index `pos`.

 # Arguments
 - `buf`, `pos`, `eof`: byte buffer, starting index and end bound. `buf[i]` is only read
   for `i < eof`; `pos >= eof` is treated as end of input.
 - `sentinel`: `nothing` (no sentinel handling), `missing` (a field with no characters
   counts as a sentinel), or an iterable of `(Ptr{UInt8}, Int)` pointer/length pairs,
   expected to be sorted from longest to shortest.
 - `wh1`, `wh2`: bytes skipped as whitespace around the value.
 - `oq`, `cq`, `e`: open-quote, close-quote and escape bytes.
 - `ignorerepeated`: when `true`, a run of consecutive delimiters is consumed as one.
 - `delim`: a single delimiter byte, or a `(Ptr{UInt8}, Int)` pair for a multi-byte one.

 # Returns
 `(x, code, n)`: the parsed value, a `ReturnCode` assembled from the flag constants, and
 `pos - startpos`, the distance the cursor advanced.
 """
 @inline function defaultparser(
    ::Type{T}, buf, pos, eof, sentinel,
    wh1::UInt8, wh2::UInt8, oq::UInt8, cq::UInt8, e::UInt8,
    ignorerepeated::Bool, delim::Union{UInt8, Tuple{Ptr{UInt8}, Int}},
 ) where {T <: Integer}
    startpos = pos
    code = SUCCESS
    x = zero(T)
    neg = false
    quoted = false
    sentinelpos = 0
    if pos >= eof
        # nothing to read: zero-width sentinel if allowed, otherwise invalid
        code = (sentinel === missing ? SENTINEL : INVALID) | EOF
        @goto donedone
    end
    @inbounds b = buf[pos]
    # strip leading whitespace
    while b == wh1 || b == wh2
        pos += 1
        if pos == eof
            code = INVALID | EOF
            @goto donedone
        end
        @inbounds b = buf[pos]
    end
    # check for start of quoted field
    quoted = b == oq
    if quoted
        code = QUOTED
        pos += 1
        if pos == eof
            # input ended right after the opening quote; flag EOF like every other eof exit
            code |= INVALID_QUOTED_FIELD | EOF
            @goto donedone
        end
        @inbounds b = buf[pos]
        # ignore whitespace within quoted field
        while b == wh1 || b == wh2
            pos += 1
            if pos == eof
                code |= INVALID_QUOTED_FIELD | EOF
                @goto donedone
            end
            @inbounds b = buf[pos]
        end
    end
    # check for sentinel values if applicable
    if sentinel !== nothing && sentinel !== missing
        startptr = pointer(buf, pos)
        # sentinel is an iterable of Tuple{Ptr{UInt8}, Int}, sorted from longest sentinel string to shortest
        for (ptr, len) in sentinel
            # `memcmp` is not defined in this module, so call libc directly;
            # it returns 0 when the two byte ranges are identical
            if pos + len <= eof &&
               ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), startptr, ptr, len) == 0
                sentinelpos = pos + len
                break
            end
        end
    end
    # start actual int parsing
    neg = b == UInt8('-')
    pos += neg || b == UInt8('+')
    if pos == eof
        # eof after stripping whitespace & maybe parsing '-' or '+'
        if sentinel === missing && pos == startpos
            # if we haven't moved any chars, then we count it as a zero-width sentinel match
            code |= SENTINEL | EOF
        elseif sentinel !== nothing && sentinel !== missing && sentinelpos > 0
            # if we matched a sentinel value
            pos = sentinelpos
            code |= SENTINEL | EOF
        else
            # otherwise, it's just an invalid value
            code |= INVALID | EOF
        end
        if quoted
            # if we detected a quote character, it's an invalid quoted field due to eof in the middle
            code |= INVALID_QUOTED_FIELD
        end
        @goto donedone
    end
    # from here until `donevalue`, b holds the digit value (byte minus '0'), not the raw byte
    @inbounds b = buf[pos] - UInt8('0')
    if b > 0x09
        # character isn't a digit, check for sentinels, otherwise INVALID value
        if sentinel === missing && pos == startpos
            code |= SENTINEL
        elseif sentinel !== nothing && sentinel !== missing && sentinelpos > 0
            pos = sentinelpos
            code |= SENTINEL
        else
            code |= INVALID
        end
        @goto donevalue
    end
    while true
        # convert the digit to T so the accumulator keeps type T for every integer type
        x = T(10) * x + T(b)
        pos += 1
        if pos == eof
            x = ifelse(neg, -x, x)
            # if we matched a sentinel and its length is at least the # of digits we parsed
            # we mark SENTINEL, otherwise mark it as OK value
            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL | EOF
            else
                code |= OK | EOF
            end
            if quoted
                # but if we're inside a quoted field, that's invalid
                code |= INVALID_QUOTED_FIELD
            end
            @goto donedone
        end
        @inbounds b = buf[pos] - UInt8('0')
        if b > 0x09
            # detected a non-digit, time to bail on value parsing
            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL
            else
                code |= OK
            end
            x = ifelse(neg, -x, x)
            @goto donevalue
        end
        # one more digit could overflow: continue in the checked loop below
        x > overflowval(T) && break
    end
    # extra loop because we got too close to overflowing while parsing digits
    while true
        x, ov_mul = Base.mul_with_overflow(x, T(10))
        x, ov_add = Base.add_with_overflow(x, T(b))
        if ov_mul | ov_add
            # we overflowed, check for valid sentinel, otherwise mark as OVERFLOW
            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL
            else
                code |= OVERFLOW
            end
            @goto donevalue
        end
        pos += 1
        if pos == eof
            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL | EOF
            else
                code |= OK | EOF
            end
            x = ifelse(neg, -x, x)
            if quoted
                code |= INVALID_QUOTED_FIELD
            end
            @goto donedone
        end
        @inbounds b = buf[pos] - UInt8('0')
        if b > 0x09
            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
                pos = sentinelpos
                code |= SENTINEL
            else
                code |= OK
            end
            x = ifelse(neg, -x, x)
            @goto donevalue
        end
    end

 @label donevalue
    # donevalue means we finished parsing a value or sentinel.
    # b currently holds a digit-offset value, or is stale after a jump to `sentinelpos`
    # (which may equal eof), so check the bound and reload the raw byte before comparing.
    if pos >= eof
        code |= EOF
        if quoted
            code |= INVALID_QUOTED_FIELD
        end
        @goto donedone
    end
    @inbounds b = buf[pos]
    # strip trailing whitespace
    while b == wh1 || b == wh2
        pos += 1
        if pos == eof
            code |= EOF
            if quoted
                # input ended before the closing quote was seen
                code |= INVALID_QUOTED_FIELD
            end
            @goto donedone
        end
        @inbounds b = buf[pos]
    end
    # for quoted fields, find the closing quote character
    # we should be positioned at the correct place to find the closing quote character if everything is as it should be
    # if we don't find the quote character immediately, something's wrong, so mark INVALID
    if quoted
        same = cq == e
        firstbyte = true
        while true
            pos += 1
            if same && b == e
                # quote and escape are the same byte: a doubled byte is an escaped quote,
                # a single one closes the field
                if pos == eof
                    code |= EOF
                    if !firstbyte
                        code |= INVALID
                    end
                    @goto donedone
                elseif buf[pos] != cq
                    if !firstbyte
                        code |= INVALID
                    end
                    break
                end
                pos += 1
            elseif b == e
                # escape byte: skip the escaped byte that follows
                if pos == eof
                    code |= INVALID_QUOTED_FIELD | EOF
                    @goto donedone
                end
                pos += 1
            elseif b == cq
                if !firstbyte
                    code |= INVALID
                end
                break
            end
            if pos == eof
                code |= INVALID_QUOTED_FIELD | EOF
                @goto donedone
            end
            firstbyte = false
            @inbounds b = buf[pos]
        end
        # the closing quote may have been the last byte of the input
        if pos >= eof
            code |= EOF
            @goto donedone
        end
        @inbounds b = buf[pos]
        # ignore whitespace after quoted field
        while b == wh1 || b == wh2
            pos += 1
            if pos == eof
                code |= EOF
                @goto donedone
            end
            @inbounds b = buf[pos]
        end
    end

    # now we check for a delimiter; if we don't find it, keep parsing until we do
    # NOTE(review): a single delimiter or a newline is detected without advancing `pos`,
    # while the `ignorerepeated` path advances past the delimiters, so the returned length
    # differs between the two modes -- confirm which one callers expect.
    if !ignorerepeated
        if checkdelim(delim, b, buf, pos, eof)
            # found the delimiter we were looking for
            code |= DELIMITED
            @goto donedone
        end
    else
        matched = false
        while checkdelim(delim, b, buf, pos, eof)
            matched = true
            pos += delimlen(delim)
            if pos >= eof
                code |= EOF
                break
            end
            @inbounds b = buf[pos]
        end
        if matched
            code |= DELIMITED
            @goto donedone
        end
    end
    # didn't find delimiter, but let's check for a newline character
    if b == UInt8('\n')
        code |= NEWLINE | ifelse(pos + 1 == eof, EOF, SUCCESS)
        @goto donedone
    elseif b == UInt8('\r')
        # only peek at the next byte when there is one
        if pos + 1 < eof && buf[pos + 1] == UInt8('\n')
            pos += 1
        end
        code |= NEWLINE | ifelse(pos + 1 == eof, EOF, SUCCESS)
        @goto donedone
    end
    # didn't find delimiter or newline, so we're invalid, keep parsing until we find delimiter, newline, or eof
    while true
        pos += 1
        if pos >= eof
            # ran out of input while scanning: stop here instead of reading past the buffer
            code |= EOF | INVALID_DELIMITER
            @goto donedone
        end
        @inbounds b = buf[pos]
        if !ignorerepeated
            if checkdelim(delim, b, buf, pos, eof)
                # found the delimiter we were looking for
                code |= DELIMITED | INVALID_DELIMITER
                @goto donedone
            end
        else
            matched = false
            while checkdelim(delim, b, buf, pos, eof)
                matched = true
                pos += delimlen(delim)
                if pos >= eof
                    code |= EOF
                    break
                end
                @inbounds b = buf[pos]
            end
            if matched
                code |= DELIMITED | INVALID_DELIMITER
                @goto donedone
            end
        end
        # didn't find delimiter, but let's check for a newline character;
        # unexpected bytes were skipped to get here, so the field is still flagged invalid
        if b == UInt8('\n')
            code |= NEWLINE | INVALID_DELIMITER | ifelse(pos + 1 == eof, EOF, SUCCESS)
            @goto donedone
        elseif b == UInt8('\r')
            if pos + 1 < eof && buf[pos + 1] == UInt8('\n')
                pos += 1
            end
            code |= NEWLINE | INVALID_DELIMITER | ifelse(pos + 1 == eof, EOF, SUCCESS)
            @goto donedone
        end
    end

 @label donedone
    return x, code, pos - startpos
 end

 end # module
	module Foo

	# Number of bytes a delimiter occupies in the input.
	# A single-byte delimiter has length 1; a multi-byte delimiter is passed as a
	# (pointer, length) pair and its length is the second element of the pair.
	function delimlen(delim::UInt8)
	    return 1
	end
	function delimlen(delim::Tuple{Ptr{UInt8}, Int})
	    return last(delim)
	end
	# Test whether a delimiter starts at the current position.
	#
	# Single-byte form: `b` is the byte already loaded from the current position, so the
	# check is a plain comparison; `buf`, `pos` and `eof` are unused.
	checkdelim(delim::UInt8, b, buf, pos, eof) = delim == b

	# Multi-byte form: `delim` is a (pointer, length) pair. Returns `true` only when
	# `pos + length <= eof` and the `length` bytes of `buf` starting at `pos` are
	# byte-for-byte equal to the bytes behind the pointer.
	@inline function checkdelim(delim::Tuple{Ptr{UInt8}, Int}, b, buf, pos, eof)
	    ptr, len = delim
	    # the delimiter has to fit before eof, otherwise it cannot match
	    pos + len <= eof || return false
	    # `memcmp` is not defined in this module, so call libc directly;
	    # it returns 0 when the two byte ranges are identical
	    return ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), pointer(buf, pos), ptr, len) == 0
	end

	# Largest accumulator value for which `10 * x + 9` still fits in `T`:
	# (typemax(T) - 9) divided by 10, using integer division.
	function overflowval(::Type{T}) where {T <: Integer}
	    headroom = typemax(T) - T(9)
	    return headroom ÷ T(10)
	end

	# Parsing results are reported as a 16-bit set of flags.
	const ReturnCode = Int16

	# A code is "ok" when it is strictly positive: at least one flag is set and the
	# sign bit (INVALID) is clear.
	function ok(x::ReturnCode)
	    return x > zero(ReturnCode)
	end

	const SUCCESS = 0x0000 % ReturnCode   # no flags set
	const INVALID = 0x8000 % ReturnCode   # sign bit; makes the code negative

	# success flags
	const OK = 0x0001 % ReturnCode   # bit 0
	const SENTINEL = 0x0002 % ReturnCode   # bit 1

	# property flags
	const QUOTED = 0x0004 % ReturnCode   # bit 2
	const DELIMITED = 0x0008 % ReturnCode   # bit 3
	const NEWLINE = 0x0010 % ReturnCode   # bit 4
	const EOF = 0x0020 % ReturnCode   # bit 5
	const ESCAPED_STRING = 0x0200 % ReturnCode   # bit 9

	# invalid flags (each one also carries the INVALID sign bit)
	const INVALID_QUOTED_FIELD = 0x8040 % ReturnCode   # INVALID + bit 6
	const INVALID_DELIMITER = 0x8080 % ReturnCode   # INVALID + bit 7
	const OVERFLOW = 0x8100 % ReturnCode   # INVALID + bit 8

	"""
	    defaultparser(T, buf, pos, eof, sentinel, wh1, wh2, oq, cq, e, ignorerepeated, delim)

	Parse one integer field of type `T` from the bytes of `buf`, starting at index `pos`.

	# Arguments
	- `buf`, `pos`, `eof`: byte buffer, starting index and end bound. `buf[i]` is only read
	  for `i < eof`; `pos >= eof` is treated as end of input.
	- `sentinel`: `nothing` (no sentinel handling), `missing` (a field with no characters
	  counts as a sentinel), or an iterable of `(Ptr{UInt8}, Int)` pointer/length pairs,
	  expected to be sorted from longest to shortest.
	- `wh1`, `wh2`: bytes skipped as whitespace around the value.
	- `oq`, `cq`, `e`: open-quote, close-quote and escape bytes.
	- `ignorerepeated`: when `true`, a run of consecutive delimiters is consumed as one.
	- `delim`: a single delimiter byte, or a `(Ptr{UInt8}, Int)` pair for a multi-byte one.

	# Returns
	`(x, code, n)`: the parsed value, a `ReturnCode` assembled from the flag constants, and
	`pos - startpos`, the distance the cursor advanced.
	"""
	@inline function defaultparser(
	    ::Type{T}, buf, pos, eof, sentinel,
	    wh1::UInt8, wh2::UInt8, oq::UInt8, cq::UInt8, e::UInt8,
	    ignorerepeated::Bool, delim::Union{UInt8, Tuple{Ptr{UInt8}, Int}},
	) where {T <: Integer}
	    startpos = pos
	    code = SUCCESS
	    x = zero(T)
	    neg = false
	    quoted = false
	    sentinelpos = 0
	    if pos >= eof
	        # nothing to read: zero-width sentinel if allowed, otherwise invalid
	        code = (sentinel === missing ? SENTINEL : INVALID) | EOF
	        @goto donedone
	    end
	    @inbounds b = buf[pos]
	    # strip leading whitespace
	    while b == wh1 || b == wh2
	        pos += 1
	        if pos == eof
	            code = INVALID | EOF
	            @goto donedone
	        end
	        @inbounds b = buf[pos]
	    end
	    # check for start of quoted field
	    quoted = b == oq
	    if quoted
	        code = QUOTED
	        pos += 1
	        if pos == eof
	            # input ended right after the opening quote; flag EOF like every other eof exit
	            code |= INVALID_QUOTED_FIELD | EOF
	            @goto donedone
	        end
	        @inbounds b = buf[pos]
	        # ignore whitespace within quoted field
	        while b == wh1 || b == wh2
	            pos += 1
	            if pos == eof
	                code |= INVALID_QUOTED_FIELD | EOF
	                @goto donedone
	            end
	            @inbounds b = buf[pos]
	        end
	    end
	    # check for sentinel values if applicable
	    if sentinel !== nothing && sentinel !== missing
	        startptr = pointer(buf, pos)
	        # sentinel is an iterable of Tuple{Ptr{UInt8}, Int}, sorted from longest sentinel string to shortest
	        for (ptr, len) in sentinel
	            # `memcmp` is not defined in this module, so call libc directly;
	            # it returns 0 when the two byte ranges are identical
	            if pos + len <= eof &&
	               ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), startptr, ptr, len) == 0
	                sentinelpos = pos + len
	                break
	            end
	        end
	    end
	    # start actual int parsing
	    neg = b == UInt8('-')
	    pos += neg || b == UInt8('+')
	    if pos == eof
	        # eof after stripping whitespace & maybe parsing '-' or '+'
	        if sentinel === missing && pos == startpos
	            # if we haven't moved any chars, then we count it as a zero-width sentinel match
	            code |= SENTINEL | EOF
	        elseif sentinel !== nothing && sentinel !== missing && sentinelpos > 0
	            # if we matched a sentinel value
	            pos = sentinelpos
	            code |= SENTINEL | EOF
	        else
	            # otherwise, it's just an invalid value
	            code |= INVALID | EOF
	        end
	        if quoted
	            # if we detected a quote character, it's an invalid quoted field due to eof in the middle
	            code |= INVALID_QUOTED_FIELD
	        end
	        @goto donedone
	    end
	    # from here until `donevalue`, b holds the digit value (byte minus '0'), not the raw byte
	    @inbounds b = buf[pos] - UInt8('0')
	    if b > 0x09
	        # character isn't a digit, check for sentinels, otherwise INVALID value
	        if sentinel === missing && pos == startpos
	            code |= SENTINEL
	        elseif sentinel !== nothing && sentinel !== missing && sentinelpos > 0
	            pos = sentinelpos
	            code |= SENTINEL
	        else
	            code |= INVALID
	        end
	        @goto donevalue
	    end
	    while true
	        # convert the digit to T so the accumulator keeps type T for every integer type
	        x = T(10) * x + T(b)
	        pos += 1
	        if pos == eof
	            x = ifelse(neg, -x, x)
	            # if we matched a sentinel and its length is at least the # of digits we parsed
	            # we mark SENTINEL, otherwise mark it as OK value
	            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
	                pos = sentinelpos
	                code |= SENTINEL | EOF
	            else
	                code |= OK | EOF
	            end
	            if quoted
	                # but if we're inside a quoted field, that's invalid
	                code |= INVALID_QUOTED_FIELD
	            end
	            @goto donedone
	        end
	        @inbounds b = buf[pos] - UInt8('0')
	        if b > 0x09
	            # detected a non-digit, time to bail on value parsing
	            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
	                pos = sentinelpos
	                code |= SENTINEL
	            else
	                code |= OK
	            end
	            x = ifelse(neg, -x, x)
	            @goto donevalue
	        end
	        # one more digit could overflow: continue in the checked loop below
	        x > overflowval(T) && break
	    end
	    # extra loop because we got too close to overflowing while parsing digits
	    while true
	        x, ov_mul = Base.mul_with_overflow(x, T(10))
	        x, ov_add = Base.add_with_overflow(x, T(b))
	        if ov_mul | ov_add
	            # we overflowed, check for valid sentinel, otherwise mark as OVERFLOW
	            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
	                pos = sentinelpos
	                code |= SENTINEL
	            else
	                code |= OVERFLOW
	            end
	            @goto donevalue
	        end
	        pos += 1
	        if pos == eof
	            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
	                pos = sentinelpos
	                code |= SENTINEL | EOF
	            else
	                code |= OK | EOF
	            end
	            x = ifelse(neg, -x, x)
	            if quoted
	                code |= INVALID_QUOTED_FIELD
	            end
	            @goto donedone
	        end
	        @inbounds b = buf[pos] - UInt8('0')
	        if b > 0x09
	            if sentinel !== nothing && sentinel !== missing && sentinelpos >= pos
	                pos = sentinelpos
	                code |= SENTINEL
	            else
	                code |= OK
	            end
	            x = ifelse(neg, -x, x)
	            @goto donevalue
	        end
	    end

	@label donevalue
	    # donevalue means we finished parsing a value or sentinel.
	    # b currently holds a digit-offset value, or is stale after a jump to `sentinelpos`
	    # (which may equal eof), so check the bound and reload the raw byte before comparing.
	    if pos >= eof
	        code |= EOF
	        if quoted
	            code |= INVALID_QUOTED_FIELD
	        end
	        @goto donedone
	    end
	    @inbounds b = buf[pos]
	    # strip trailing whitespace
	    while b == wh1 || b == wh2
	        pos += 1
	        if pos == eof
	            code |= EOF
	            if quoted
	                # input ended before the closing quote was seen
	                code |= INVALID_QUOTED_FIELD
	            end
	            @goto donedone
	        end
	        @inbounds b = buf[pos]
	    end
	    # for quoted fields, find the closing quote character
	    # we should be positioned at the correct place to find the closing quote character if everything is as it should be
	    # if we don't find the quote character immediately, something's wrong, so mark INVALID
	    if quoted
	        same = cq == e
	        firstbyte = true
	        while true
	            pos += 1
	            if same && b == e
	                # quote and escape are the same byte: a doubled byte is an escaped quote,
	                # a single one closes the field
	                if pos == eof
	                    code |= EOF
	                    if !firstbyte
	                        code |= INVALID
	                    end
	                    @goto donedone
	                elseif buf[pos] != cq
	                    if !firstbyte
	                        code |= INVALID
	                    end
	                    break
	                end
	                pos += 1
	            elseif b == e
	                # escape byte: skip the escaped byte that follows
	                if pos == eof
	                    code |= INVALID_QUOTED_FIELD | EOF
	                    @goto donedone
	                end
	                pos += 1
	            elseif b == cq
	                if !firstbyte
	                    code |= INVALID
	                end
	                break
	            end
	            if pos == eof
	                code |= INVALID_QUOTED_FIELD | EOF
	                @goto donedone
	            end
	            firstbyte = false
	            @inbounds b = buf[pos]
	        end
	        # the closing quote may have been the last byte of the input
	        if pos >= eof
	            code |= EOF
	            @goto donedone
	        end
	        @inbounds b = buf[pos]
	        # ignore whitespace after quoted field
	        while b == wh1 || b == wh2
	            pos += 1
	            if pos == eof
	                code |= EOF
	                @goto donedone
	            end
	            @inbounds b = buf[pos]
	        end
	    end

	    # now we check for a delimiter; if we don't find it, keep parsing until we do
	    # NOTE(review): a single delimiter or a newline is detected without advancing `pos`,
	    # while the `ignorerepeated` path advances past the delimiters, so the returned length
	    # differs between the two modes -- confirm which one callers expect.
	    if !ignorerepeated
	        if checkdelim(delim, b, buf, pos, eof)
	            # found the delimiter we were looking for
	            code |= DELIMITED
	            @goto donedone
	        end
	    else
	        matched = false
	        while checkdelim(delim, b, buf, pos, eof)
	            matched = true
	            pos += delimlen(delim)
	            if pos >= eof
	                code |= EOF
	                break
	            end
	            @inbounds b = buf[pos]
	        end
	        if matched
	            code |= DELIMITED
	            @goto donedone
	        end
	    end
	    # didn't find delimiter, but let's check for a newline character
	    if b == UInt8('\n')
	        code |= NEWLINE | ifelse(pos + 1 == eof, EOF, SUCCESS)
	        @goto donedone
	    elseif b == UInt8('\r')
	        # only peek at the next byte when there is one
	        if pos + 1 < eof && buf[pos + 1] == UInt8('\n')
	            pos += 1
	        end
	        code |= NEWLINE | ifelse(pos + 1 == eof, EOF, SUCCESS)
	        @goto donedone
	    end
	    # didn't find delimiter or newline, so we're invalid, keep parsing until we find delimiter, newline, or eof
	    while true
	        pos += 1
	        if pos >= eof
	            # ran out of input while scanning: stop here instead of reading past the buffer
	            code |= EOF | INVALID_DELIMITER
	            @goto donedone
	        end
	        @inbounds b = buf[pos]
	        if !ignorerepeated
	            if checkdelim(delim, b, buf, pos, eof)
	                # found the delimiter we were looking for
	                code |= DELIMITED | INVALID_DELIMITER
	                @goto donedone
	            end
	        else
	            matched = false
	            while checkdelim(delim, b, buf, pos, eof)
	                matched = true
	                pos += delimlen(delim)
	                if pos >= eof
	                    code |= EOF
	                    break
	                end
	                @inbounds b = buf[pos]
	            end
	            if matched
	                code |= DELIMITED | INVALID_DELIMITER
	                @goto donedone
	            end
	        end
	        # didn't find delimiter, but let's check for a newline character;
	        # unexpected bytes were skipped to get here, so the field is still flagged invalid
	        if b == UInt8('\n')
	            code |= NEWLINE | INVALID_DELIMITER | ifelse(pos + 1 == eof, EOF, SUCCESS)
	            @goto donedone
	        elseif b == UInt8('\r')
	            if pos + 1 < eof && buf[pos + 1] == UInt8('\n')
	                pos += 1
	            end
	            code |= NEWLINE | INVALID_DELIMITER | ifelse(pos + 1 == eof, EOF, SUCCESS)
	            @goto donedone
	        end
	    end

	@label donedone
	    return x, code, pos - startpos
	end

	end # module