Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Created January 16, 2017 05:32
Show Gist options
  • Save bicycle1885/c57e0a6f1a356fbc8436ba21c7ce7962 to your computer and use it in GitHub Desktop.
Save bicycle1885/c57e0a6f1a356fbc8436ba21c7ce7962 to your computer and use it in GitHub Desktop.
[WIP] VCF parser
import Automa
import Automa.RegExp: @re_str
const re = Automa.RegExp
# VCF v4.2
const vcf_machine = (function ()
delim(x, sep) = re.opt(re.cat(x, re.rep(re.cat(sep, x))))
nl = re"\n"
newline = re"\r?" * nl
sampleID = re"[ -~]+"
chrom = re"[!-~]+|\."
pos = re"[0-9]+|\."
id = re"[!-:<-~]+|\." # no semicolon
ref = re"[!-~]+|\."
alt = re"[!-~]+|\."
qual = re"[0-9]+(\.[0-9]+)?|\."
filter = re"[!-:<-~]+|\." # no semicolon
info = re"[!-:<-~]+|\." # no semicolon
format = re"[0-9A-Za-z]+"
genotype = re"[ -9;-~]+" # no colon
metainfo = re.cat("##", re"[0-9A-Za-z]+", '=', re"[ -~]*")
header = re.cat(
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
re.opt(re"\tFORMAT" * re.rep(re"\t" * sampleID)))
record = re.cat(
chrom, '\t',
pos, '\t',
delim(id, ';'), '\t',
ref, '\t',
alt, '\t',
qual, '\t',
delim(filter, ';'), '\t',
delim(info, ';'),
re.opt(re"\t" * delim(format, ':') * re.rep(re"\t" * delim(genotype, ':'))))
vcf = re.cat(
delim(metainfo, newline), newline,
header, newline,
delim(record, newline), newline)
nl.actions[:enter] = [:count_line]
metainfo.actions[:enter] = [:anchor]
metainfo.actions[:exit] = [:metainfo]
sampleID.actions[:enter] = [:anchor]
sampleID.actions[:exit] = [:sampleID]
chrom.actions[:enter] = [:anchor]
chrom.actions[:exit] = [:chrom]
pos.actions[:enter] = [:anchor]
pos.actions[:exit] = [:pos]
id.actions[:enter] = [:anchor, :mark]
id.actions[:exit] = [:id]
ref.actions[:enter] = [:mark]
ref.actions[:exit] = [:ref]
alt.actions[:enter] = [:mark]
alt.actions[:exit] = [:alt]
qual.actions[:enter] = [:mark]
qual.actions[:exit] = [:qual]
filter.actions[:enter] = [:mark]
filter.actions[:exit] = [:filter]
info.actions[:enter] = [:mark]
info.actions[:exit] = [:info]
format.actions[:enter] = [:mark]
format.actions[:exit] = [:format]
genotype.actions[:enter] = [:mark]
genotype.actions[:exit] = [:genotype]
record.actions[:exit] = [:record]
return Automa.compile(vcf)
end)()
write("vcf.dot", Automa.dfa2dot(vcf_machine.dfa))
run(`dot -Tsvg -o vcf.svg vcf.dot`)
# import Bio
import BufferedStreams: BufferedInputStream
type State{T<:BufferedInputStream}
stream::T # input stream
cs::Int # current DFA state of Ragel
linenum::Int # line number: parser is responsible for updating this
finished::Bool # true if finished (regardless of where in the stream we are)
end
function State(initstate::Int, input::BufferedInputStream)
return State(input, initstate, 1, false)
end
type VCFReader
state::State
metainfo::Vector{Pair{String,String}}
sampleIDs::Vector{String}
# temporary fields for parsing
offset::Int
id::Vector{UnitRange{Int}}
filter::Vector{UnitRange{Int}}
info::Vector{UnitRange{Int}}
format::Vector{UnitRange{Int}}
genotype::Vector{UnitRange{Int}}
function VCFReader(input::BufferedInputStream)
return new(State(vcf_machine.start_state, input), [], [], 0, [], [], [], [], [])
end
end
function add_metainfo!(reader::VCFReader, data, r::UnitRange{Int})
lo, hi = first(r), last(r)
@assert data[lo] == data[lo+1] == UInt8('#')
pos = 0
for i in lo+2:hi
if data[i] == UInt8('=')
pos = i
break
end
end
@assert pos > 0
key = String(data[lo+2:pos-1])
val = String(data[pos+1:hi])
push!(reader.metainfo, key => val)
return reader
end
function add_sampleID!(reader::VCFReader, data, r::UnitRange{Int})
push!(reader.sampleIDs, data[r])
return reader
end
function init_record_cache!(reader::VCFReader)
reader.offset = 0
empty!(reader.id)
empty!(reader.filter)
empty!(reader.info)
empty!(reader.format)
empty!(reader.genotype)
return reader
end
type VCFRecord
chrom::String
pos::Nullable{Int}
# data and ranges
data::Vector{UInt8}
id::Vector{UnitRange{Int}}
ref::UnitRange{Int}
alt::UnitRange{Int}
qual::Nullable{Float64}
filter::Vector{UnitRange{Int}}
info::Vector{UnitRange{Int}}
format::Vector{UnitRange{Int}}
genotype::Vector{UnitRange{Int}}
end
function VCFRecord()
return VCFRecord(
".",
Nullable{Int}(),
[],
[],
0:-1,
0:-1,
Nullable{Int}(),
[],
[],
[],
[])
end
function try_parse_int64(data, r::UnitRange{Int})
lo, hi = first(r), last(r)
if data[lo] == UInt8('.')
return Nullable{Int64}()
elseif data[lo] == UInt8('-')
sign = -1
lo += 1
else
sign = +1
end
x = Int64(0)
for i in lo:hi
x = 10x + data[i] - UInt8('0')
end
return Nullable{Int64}(sign * x)
end
function try_parse_float64(data, r::UnitRange{Int})
return ccall(
:jl_try_substrtod,
Nullable{Float64},
(Ptr{UInt8}, Csize_t, Csize_t),
data, first(r) - 1, length(r))
end
function set_chrom!(record, data, r)
if length(r) == sizeof(record.chrom) && memcmp(pointer(data, first(r)), pointer(record.chrom), length(r)) == 0
return record
end
record.chrom = String(data[r])
return record
end
function memcmp(p1, p2, n)
return ccall(:memcmp, Cint, (Ptr{Void}, Ptr{Void}, Csize_t), p1, p2, n)
end
@inline function anchor!(stream, p)
stream.anchor = p
end
@inline function upanchor!(stream)
@assert stream.anchor != 0 "upanchor! called with no anchor set"
anchor = stream.anchor
stream.anchor = 0
return anchor
end
vcf_actions = Dict(
:count_line => :(linenum += 1),
:anchor => :(anchor!(stream, p)),
:mark => :(mark = p),
:metainfo => :(add_metainfo!(reader, data, upanchor!(stream):p-1)),
:sampleID => :(add_sampleID!(reader, data, upanchor!(stream):p-1)),
:chrom => :(init_record_cache!(reader); set_chrom!(record, data, upanchor!(stream):p-1)),
:pos => :(record.pos = try_parse_int64(data, upanchor!(stream):p-1); reader.offset = p),
:id => :(push!(reader.id, (mark:p-1)-reader.offset)),
:ref => :(record.ref = (mark:p-1)-reader.offset),
:alt => :(record.alt = (mark:p-1)-reader.offset),
:qual => :(record.qual = try_parse_float64(data, mark:p-1)),
:filter => :(push!(reader.filter, (mark:p-1)-reader.offset)),
:info => :(push!(reader.info, (mark:p-1)-reader.offset)),
:format => :(push!(reader.format, (mark:p-1)-reader.offset)),
:genotype => :(push!(reader.genotype, (mark:p-1)-reader.offset)),
:record => :(found_record = true; @escape))
@eval function _read!(reader::VCFReader, state::State, record::VCFRecord)
stream = state.stream
cs = state.cs
linenum = state.linenum
p = stream.position
p_end = stream.available
p_eof = -1
data = stream.buffer
found_record = false
mark = 0
while true
$(Automa.generate_exec_code(vcf_machine, actions=vcf_actions, code=:goto))
state.cs = cs
state.linenum = linenum
stream.position = p
if cs < 0
error("VCF file format error on line ", linenum)
elseif cs == 0
state.finished = true
else
if p > p_end
hits_eof = BufferedStreams.fillbuffer!(stream) == 0
p = stream.position
p_end = stream.available
if hits_eof
p_eof = p_end
end
end
end
if found_record
resize_and_copy!(record.data, data, upanchor!(stream):p-1)
resize_and_copy!(record.id, reader.id)
resize_and_copy!(record.filter, reader.filter)
resize_and_copy!(record.info, reader.info)
resize_and_copy!(record.format, reader.format)
resize_and_copy!(record.genotype, reader.genotype)
break
elseif cs == 0
throw(EOFError())
end
end
return record
end
function resize_and_copy!(dst, src, r=1:endof(src))
len = length(r)
if length(dst) != len
resize!(dst, len)
end
copy!(dst, 1, src, first(r), len)
return dst
end
# "/Users/kenta/bioinfo/data/gatk/CEUTrio.HiSeq.WGS.b37.NA12878.vcf"
# "/Users/kenta/vendor/seqan/demos/vcf_io/example.vcf"
let
reader = VCFReader(BufferedInputStream(open("1000G_omni2.5.b37.vcf")))
record = VCFRecord()
# warm-up
_read!(reader, reader.state, record)
n = 0
@time try
while true
_read!(reader, reader.state, record)
n += 1
end
catch ex
if isa(ex, EOFError)
# pass
else
rethrow()
end
end
@show record
@show String(record.data)
@show n
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment