bicycle1885 · January 16, 2017 05:32
diff --git a/vcf.jl b/vcf.jl
 import Automa
 import Automa.RegExp: @re_str
 const re = Automa.RegExp

 # VCF v4.2
 const vcf_machine = (function ()
    delim(x, sep) = re.opt(re.cat(x, re.rep(re.cat(sep, x))))

    nl = re"\n"
    newline = re"\r?" * nl

    sampleID = re"[ -~]+"
    chrom    = re"[!-~]+|\."
    pos      = re"[0-9]+|\."
    id       = re"[!-:<-~]+|\."  # no semicolon
    ref      = re"[!-~]+|\."
    alt      = re"[!-~]+|\."
    qual     = re"[0-9]+(\.[0-9]+)?|\."
    filter   = re"[!-:<-~]+|\."  # no semicolon
    info     = re"[!-:<-~]+|\."  # no semicolon
    format   = re"[0-9A-Za-z]+"
    genotype = re"[ -9;-~]+"     # no colon

    metainfo = re.cat("##", re"[0-9A-Za-z]+", '=', re"[ -~]*")
    header = re.cat(
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
        re.opt(re"\tFORMAT" * re.rep(re"\t" * sampleID)))
    record = re.cat(
        chrom,              '\t',
        pos,                '\t',
        delim(id, ';'),     '\t',
        ref,                '\t',
        alt,                '\t',
        qual,               '\t',
        delim(filter, ';'), '\t',
        delim(info,   ';'),
        re.opt(re"\t" * delim(format, ':') * re.rep(re"\t" * delim(genotype, ':'))))

    vcf = re.cat(
        delim(metainfo, newline), newline,
        header, newline,
        delim(record, newline), newline)

    nl.actions[:enter]       = [:count_line]
    metainfo.actions[:enter] = [:anchor]
    metainfo.actions[:exit]  = [:metainfo]
    sampleID.actions[:enter] = [:anchor]
    sampleID.actions[:exit]  = [:sampleID]
    chrom.actions[:enter]    = [:anchor]
    chrom.actions[:exit]     = [:chrom]
    pos.actions[:enter]      = [:anchor]
    pos.actions[:exit]       = [:pos]
    id.actions[:enter]       = [:anchor, :mark]
    id.actions[:exit]        = [:id]
    ref.actions[:enter]      = [:mark]
    ref.actions[:exit]       = [:ref]
    alt.actions[:enter]      = [:mark]
    alt.actions[:exit]       = [:alt]
    qual.actions[:enter]     = [:mark]
    qual.actions[:exit]      = [:qual]
    filter.actions[:enter]   = [:mark]
    filter.actions[:exit]    = [:filter]
    info.actions[:enter]     = [:mark]
    info.actions[:exit]      = [:info]
    format.actions[:enter]   = [:mark]
    format.actions[:exit]    = [:format]
    genotype.actions[:enter] = [:mark]
    genotype.actions[:exit]  = [:genotype]
    record.actions[:exit]    = [:record]

    return Automa.compile(vcf)
 end)()

 write("vcf.dot", Automa.dfa2dot(vcf_machine.dfa))
 run(`dot -Tsvg -o vcf.svg vcf.dot`)

 # import Bio
 import BufferedStreams: BufferedInputStream

 type State{T<:BufferedInputStream}
    stream::T      # input stream
    cs::Int        # current DFA state of Ragel
    linenum::Int   # line number: parser is responsible for updating this
    finished::Bool # true if finished (regardless of where in the stream we are)
 end

 function State(initstate::Int, input::BufferedInputStream)
    return State(input, initstate, 1, false)
 end

 type VCFReader
    state::State
    metainfo::Vector{Pair{String,String}}
    sampleIDs::Vector{String}

    # temporary fields for parsing
    offset::Int
    id::Vector{UnitRange{Int}}
    filter::Vector{UnitRange{Int}}
    info::Vector{UnitRange{Int}}
    format::Vector{UnitRange{Int}}
    genotype::Vector{UnitRange{Int}}

    function VCFReader(input::BufferedInputStream)
        return new(State(vcf_machine.start_state, input), [], [], 0, [], [], [], [], [])
    end
 end

 function add_metainfo!(reader::VCFReader, data, r::UnitRange{Int})
    lo, hi = first(r), last(r)
    @assert data[lo] == data[lo+1] == UInt8('#')
    pos = 0
    for i in lo+2:hi
        if data[i] == UInt8('=')
            pos = i
            break
        end
    end
    @assert pos > 0
    key = String(data[lo+2:pos-1])
    val = String(data[pos+1:hi])
    push!(reader.metainfo, key => val)
    return reader
 end

 function add_sampleID!(reader::VCFReader, data, r::UnitRange{Int})
    push!(reader.sampleIDs, data[r])
    return reader
 end

 function init_record_cache!(reader::VCFReader)
    reader.offset = 0
    empty!(reader.id)
    empty!(reader.filter)
    empty!(reader.info)
    empty!(reader.format)
    empty!(reader.genotype)
    return reader
 end

 type VCFRecord
    chrom::String
    pos::Nullable{Int}

    # data and ranges
    data::Vector{UInt8}
    id::Vector{UnitRange{Int}}
    ref::UnitRange{Int}
    alt::UnitRange{Int}
    qual::Nullable{Float64}
    filter::Vector{UnitRange{Int}}
    info::Vector{UnitRange{Int}}
    format::Vector{UnitRange{Int}}
    genotype::Vector{UnitRange{Int}}
 end

 function VCFRecord()
    return VCFRecord(
        ".",
        Nullable{Int}(),
        [],
        [],
        0:-1,
        0:-1,
        Nullable{Int}(),
        [],
        [],
        [],
        [])
 end

 function try_parse_int64(data, r::UnitRange{Int})
    lo, hi = first(r), last(r)
    if data[lo] == UInt8('.')
        return Nullable{Int64}()
    elseif data[lo] == UInt8('-')
        sign = -1
        lo += 1
    else
        sign = +1
    end
    x = Int64(0)
    for i in lo:hi
        x = 10x + data[i] - UInt8('0')
    end
    return Nullable{Int64}(sign * x)
 end

 function try_parse_float64(data, r::UnitRange{Int})
    return ccall(
        :jl_try_substrtod,
        Nullable{Float64},
        (Ptr{UInt8}, Csize_t, Csize_t),
        data, first(r) - 1, length(r))
 end

 function set_chrom!(record, data, r)
    if length(r) == sizeof(record.chrom) && memcmp(pointer(data, first(r)), pointer(record.chrom), length(r)) == 0
        return record
    end
    record.chrom = String(data[r])
    return record
 end

 function memcmp(p1, p2, n)
    return ccall(:memcmp, Cint, (Ptr{Void}, Ptr{Void}, Csize_t), p1, p2, n)
 end

 @inline function anchor!(stream, p)
    stream.anchor = p
 end

 @inline function upanchor!(stream)
    @assert stream.anchor != 0 "upanchor! called with no anchor set"
    anchor = stream.anchor
    stream.anchor = 0
    return anchor
 end

 vcf_actions = Dict(
    :count_line => :(linenum += 1),
    :anchor     => :(anchor!(stream, p)),
    :mark       => :(mark = p),
    :metainfo   => :(add_metainfo!(reader, data, upanchor!(stream):p-1)),
    :sampleID   => :(add_sampleID!(reader, data, upanchor!(stream):p-1)),
    :chrom      => :(init_record_cache!(reader); set_chrom!(record, data, upanchor!(stream):p-1)),
    :pos        => :(record.pos = try_parse_int64(data, upanchor!(stream):p-1); reader.offset = p),
    :id         => :(push!(reader.id, (mark:p-1)-reader.offset)),
    :ref        => :(record.ref = (mark:p-1)-reader.offset),
    :alt        => :(record.alt = (mark:p-1)-reader.offset),
    :qual       => :(record.qual = try_parse_float64(data, mark:p-1)),
    :filter     => :(push!(reader.filter, (mark:p-1)-reader.offset)),
    :info       => :(push!(reader.info, (mark:p-1)-reader.offset)),
    :format     => :(push!(reader.format, (mark:p-1)-reader.offset)),
    :genotype   => :(push!(reader.genotype, (mark:p-1)-reader.offset)),
    :record     => :(found_record = true; @escape))

 @eval function _read!(reader::VCFReader, state::State, record::VCFRecord)
    stream = state.stream
    cs = state.cs
    linenum = state.linenum
    p = stream.position
    p_end = stream.available
    p_eof = -1
    data = stream.buffer
    found_record = false
    mark = 0

    while true
        $(Automa.generate_exec_code(vcf_machine, actions=vcf_actions, code=:goto))

        state.cs = cs
        state.linenum = linenum
        stream.position = p

        if cs < 0
            error("VCF file format error on line ", linenum)
        elseif cs == 0
            state.finished = true
        else
            if p > p_end
                hits_eof = BufferedStreams.fillbuffer!(stream) == 0
                p = stream.position
                p_end = stream.available
                if hits_eof
                    p_eof = p_end
                end
            end
        end

        if found_record
            resize_and_copy!(record.data, data, upanchor!(stream):p-1)
            resize_and_copy!(record.id, reader.id)
            resize_and_copy!(record.filter, reader.filter)
            resize_and_copy!(record.info, reader.info)
            resize_and_copy!(record.format, reader.format)
            resize_and_copy!(record.genotype, reader.genotype)
            break
        elseif cs == 0
            throw(EOFError())
        end
    end

    return record
 end

 function resize_and_copy!(dst, src, r=1:endof(src))
    len = length(r)
    if length(dst) != len
        resize!(dst, len)
    end
    copy!(dst, 1, src, first(r), len)
    return dst
 end

 # "/Users/kenta/bioinfo/data/gatk/CEUTrio.HiSeq.WGS.b37.NA12878.vcf"
 # "/Users/kenta/vendor/seqan/demos/vcf_io/example.vcf"
 let
    reader = VCFReader(BufferedInputStream(open("1000G_omni2.5.b37.vcf")))
    record = VCFRecord()
    # warm-up
    _read!(reader, reader.state, record)
    n = 0
    @time try
        while true
            _read!(reader, reader.state, record)
            n += 1
        end
    catch ex
        if isa(ex, EOFError)
            # pass
        else
            rethrow()
        end
    end
    @show record
    @show String(record.data)
    @show n
 end
	import Automa
	import Automa.RegExp: @re_str
	const re = Automa.RegExp

	# VCF v4.2
	const vcf_machine = (function ()
	delim(x, sep) = re.opt(re.cat(x, re.rep(re.cat(sep, x))))

	nl = re"\n"
	newline = re"\r?" * nl

	sampleID = re"[ -~]+"
	chrom = re"[!-~]+\|\."
	pos = re"[0-9]+\|\."
	id = re"[!-:<-~]+\|\." # no semicolon
	ref = re"[!-~]+\|\."
	alt = re"[!-~]+\|\."
	qual = re"[0-9]+(\.[0-9]+)?\|\."
	filter = re"[!-:<-~]+\|\." # no semicolon
	info = re"[!-:<-~]+\|\." # no semicolon
	format = re"[0-9A-Za-z]+"
	genotype = re"[ -9;-~]+" # no colon

	metainfo = re.cat("##", re"[0-9A-Za-z]+", '=', re"[ -~]*")
	header = re.cat(
	"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
	re.opt(re"\tFORMAT" * re.rep(re"\t" * sampleID)))
	record = re.cat(
	chrom, '\t',
	pos, '\t',
	delim(id, ';'), '\t',
	ref, '\t',
	alt, '\t',
	qual, '\t',
	delim(filter, ';'), '\t',
	delim(info, ';'),
	re.opt(re"\t" * delim(format, ':') * re.rep(re"\t" * delim(genotype, ':'))))

	vcf = re.cat(
	delim(metainfo, newline), newline,
	header, newline,
	delim(record, newline), newline)

	nl.actions[:enter] = [:count_line]
	metainfo.actions[:enter] = [:anchor]
	metainfo.actions[:exit] = [:metainfo]
	sampleID.actions[:enter] = [:anchor]
	sampleID.actions[:exit] = [:sampleID]
	chrom.actions[:enter] = [:anchor]
	chrom.actions[:exit] = [:chrom]
	pos.actions[:enter] = [:anchor]
	pos.actions[:exit] = [:pos]
	id.actions[:enter] = [:anchor, :mark]
	id.actions[:exit] = [:id]
	ref.actions[:enter] = [:mark]
	ref.actions[:exit] = [:ref]
	alt.actions[:enter] = [:mark]
	alt.actions[:exit] = [:alt]
	qual.actions[:enter] = [:mark]
	qual.actions[:exit] = [:qual]
	filter.actions[:enter] = [:mark]
	filter.actions[:exit] = [:filter]
	info.actions[:enter] = [:mark]
	info.actions[:exit] = [:info]
	format.actions[:enter] = [:mark]
	format.actions[:exit] = [:format]
	genotype.actions[:enter] = [:mark]
	genotype.actions[:exit] = [:genotype]
	record.actions[:exit] = [:record]

	return Automa.compile(vcf)
	end)()

	write("vcf.dot", Automa.dfa2dot(vcf_machine.dfa))
	run(`dot -Tsvg -o vcf.svg vcf.dot`)

	# import Bio
	import BufferedStreams: BufferedInputStream

	type State{T<:BufferedInputStream}
	stream::T # input stream
	cs::Int # current DFA state of Ragel
	linenum::Int # line number: parser is responsible for updating this
	finished::Bool # true if finished (regardless of where in the stream we are)
	end

	function State(initstate::Int, input::BufferedInputStream)
	return State(input, initstate, 1, false)
	end

	type VCFReader
	state::State
	metainfo::Vector{Pair{String,String}}
	sampleIDs::Vector{String}

	# temporary fields for parsing
	offset::Int
	id::Vector{UnitRange{Int}}
	filter::Vector{UnitRange{Int}}
	info::Vector{UnitRange{Int}}
	format::Vector{UnitRange{Int}}
	genotype::Vector{UnitRange{Int}}

	function VCFReader(input::BufferedInputStream)
	return new(State(vcf_machine.start_state, input), [], [], 0, [], [], [], [], [])
	end
	end

	function add_metainfo!(reader::VCFReader, data, r::UnitRange{Int})
	lo, hi = first(r), last(r)
	@assert data[lo] == data[lo+1] == UInt8('#')
	pos = 0
	for i in lo+2:hi
	if data[i] == UInt8('=')
	pos = i
	break
	end
	end
	@assert pos > 0
	key = String(data[lo+2:pos-1])
	val = String(data[pos+1:hi])
	push!(reader.metainfo, key => val)
	return reader
	end

	function add_sampleID!(reader::VCFReader, data, r::UnitRange{Int})
	push!(reader.sampleIDs, data[r])
	return reader
	end

	function init_record_cache!(reader::VCFReader)
	reader.offset = 0
	empty!(reader.id)
	empty!(reader.filter)
	empty!(reader.info)
	empty!(reader.format)
	empty!(reader.genotype)
	return reader
	end

	type VCFRecord
	chrom::String
	pos::Nullable{Int}

	# data and ranges
	data::Vector{UInt8}
	id::Vector{UnitRange{Int}}
	ref::UnitRange{Int}
	alt::UnitRange{Int}
	qual::Nullable{Float64}
	filter::Vector{UnitRange{Int}}
	info::Vector{UnitRange{Int}}
	format::Vector{UnitRange{Int}}
	genotype::Vector{UnitRange{Int}}
	end

	function VCFRecord()
	return VCFRecord(
	".",
	Nullable{Int}(),
	[],
	[],
	0:-1,
	0:-1,
	Nullable{Int}(),
	[],
	[],
	[],
	[])
	end

	function try_parse_int64(data, r::UnitRange{Int})
	lo, hi = first(r), last(r)
	if data[lo] == UInt8('.')
	return Nullable{Int64}()
	elseif data[lo] == UInt8('-')
	sign = -1
	lo += 1
	else
	sign = +1
	end
	x = Int64(0)
	for i in lo:hi
	x = 10x + data[i] - UInt8('0')
	end
	return Nullable{Int64}(sign * x)
	end

	function try_parse_float64(data, r::UnitRange{Int})
	return ccall(
	:jl_try_substrtod,
	Nullable{Float64},
	(Ptr{UInt8}, Csize_t, Csize_t),
	data, first(r) - 1, length(r))
	end

	function set_chrom!(record, data, r)
	if length(r) == sizeof(record.chrom) && memcmp(pointer(data, first(r)), pointer(record.chrom), length(r)) == 0
	return record
	end
	record.chrom = String(data[r])
	return record
	end

	function memcmp(p1, p2, n)
	return ccall(:memcmp, Cint, (Ptr{Void}, Ptr{Void}, Csize_t), p1, p2, n)
	end

	@inline function anchor!(stream, p)
	stream.anchor = p
	end

	@inline function upanchor!(stream)
	@assert stream.anchor != 0 "upanchor! called with no anchor set"
	anchor = stream.anchor
	stream.anchor = 0
	return anchor
	end

	vcf_actions = Dict(
	:count_line => :(linenum += 1),
	:anchor => :(anchor!(stream, p)),
	:mark => :(mark = p),
	:metainfo => :(add_metainfo!(reader, data, upanchor!(stream):p-1)),
	:sampleID => :(add_sampleID!(reader, data, upanchor!(stream):p-1)),
	:chrom => :(init_record_cache!(reader); set_chrom!(record, data, upanchor!(stream):p-1)),
	:pos => :(record.pos = try_parse_int64(data, upanchor!(stream):p-1); reader.offset = p),
	:id => :(push!(reader.id, (mark:p-1)-reader.offset)),
	:ref => :(record.ref = (mark:p-1)-reader.offset),
	:alt => :(record.alt = (mark:p-1)-reader.offset),
	:qual => :(record.qual = try_parse_float64(data, mark:p-1)),
	:filter => :(push!(reader.filter, (mark:p-1)-reader.offset)),
	:info => :(push!(reader.info, (mark:p-1)-reader.offset)),
	:format => :(push!(reader.format, (mark:p-1)-reader.offset)),
	:genotype => :(push!(reader.genotype, (mark:p-1)-reader.offset)),
	:record => :(found_record = true; @escape))

	@eval function _read!(reader::VCFReader, state::State, record::VCFRecord)
	stream = state.stream
	cs = state.cs
	linenum = state.linenum
	p = stream.position
	p_end = stream.available
	p_eof = -1
	data = stream.buffer
	found_record = false
	mark = 0

	while true
	$(Automa.generate_exec_code(vcf_machine, actions=vcf_actions, code=:goto))

	state.cs = cs
	state.linenum = linenum
	stream.position = p

	if cs < 0
	error("VCF file format error on line ", linenum)
	elseif cs == 0
	state.finished = true
	else
	if p > p_end
	hits_eof = BufferedStreams.fillbuffer!(stream) == 0
	p = stream.position
	p_end = stream.available
	if hits_eof
	p_eof = p_end
	end
	end
	end

	if found_record
	resize_and_copy!(record.data, data, upanchor!(stream):p-1)
	resize_and_copy!(record.id, reader.id)
	resize_and_copy!(record.filter, reader.filter)
	resize_and_copy!(record.info, reader.info)
	resize_and_copy!(record.format, reader.format)
	resize_and_copy!(record.genotype, reader.genotype)
	break
	elseif cs == 0
	throw(EOFError())
	end
	end

	return record
	end

	function resize_and_copy!(dst, src, r=1:endof(src))
	len = length(r)
	if length(dst) != len
	resize!(dst, len)
	end
	copy!(dst, 1, src, first(r), len)
	return dst
	end

	# "/Users/kenta/bioinfo/data/gatk/CEUTrio.HiSeq.WGS.b37.NA12878.vcf"
	# "/Users/kenta/vendor/seqan/demos/vcf_io/example.vcf"
	let
	reader = VCFReader(BufferedInputStream(open("1000G_omni2.5.b37.vcf")))
	record = VCFRecord()
	# warm-up
	_read!(reader, reader.state, record)
	n = 0
	@time try
	while true
	_read!(reader, reader.state, record)
	n += 1
	end
	catch ex
	if isa(ex, EOFError)
	# pass
	else
	rethrow()
	end
	end
	@show record
	@show String(record.data)
	@show n
	end