Created
January 16, 2017 05:32
-
-
Save bicycle1885/c57e0a6f1a356fbc8436ba21c7ce7962 to your computer and use it in GitHub Desktop.
[WIP] VCF parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Automa | |
import Automa.RegExp: @re_str | |
const re = Automa.RegExp | |
# VCF v4.2 | |
const vcf_machine = (function () | |
delim(x, sep) = re.opt(re.cat(x, re.rep(re.cat(sep, x)))) | |
nl = re"\n" | |
newline = re"\r?" * nl | |
sampleID = re"[ -~]+" | |
chrom = re"[!-~]+|\." | |
pos = re"[0-9]+|\." | |
id = re"[!-:<-~]+|\." # no semicolon | |
ref = re"[!-~]+|\." | |
alt = re"[!-~]+|\." | |
qual = re"[0-9]+(\.[0-9]+)?|\." | |
filter = re"[!-:<-~]+|\." # no semicolon | |
info = re"[!-:<-~]+|\." # no semicolon | |
format = re"[0-9A-Za-z]+" | |
genotype = re"[ -9;-~]+" # no colon | |
metainfo = re.cat("##", re"[0-9A-Za-z]+", '=', re"[ -~]*") | |
header = re.cat( | |
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", | |
re.opt(re"\tFORMAT" * re.rep(re"\t" * sampleID))) | |
record = re.cat( | |
chrom, '\t', | |
pos, '\t', | |
delim(id, ';'), '\t', | |
ref, '\t', | |
alt, '\t', | |
qual, '\t', | |
delim(filter, ';'), '\t', | |
delim(info, ';'), | |
re.opt(re"\t" * delim(format, ':') * re.rep(re"\t" * delim(genotype, ':')))) | |
vcf = re.cat( | |
delim(metainfo, newline), newline, | |
header, newline, | |
delim(record, newline), newline) | |
nl.actions[:enter] = [:count_line] | |
metainfo.actions[:enter] = [:anchor] | |
metainfo.actions[:exit] = [:metainfo] | |
sampleID.actions[:enter] = [:anchor] | |
sampleID.actions[:exit] = [:sampleID] | |
chrom.actions[:enter] = [:anchor] | |
chrom.actions[:exit] = [:chrom] | |
pos.actions[:enter] = [:anchor] | |
pos.actions[:exit] = [:pos] | |
id.actions[:enter] = [:anchor, :mark] | |
id.actions[:exit] = [:id] | |
ref.actions[:enter] = [:mark] | |
ref.actions[:exit] = [:ref] | |
alt.actions[:enter] = [:mark] | |
alt.actions[:exit] = [:alt] | |
qual.actions[:enter] = [:mark] | |
qual.actions[:exit] = [:qual] | |
filter.actions[:enter] = [:mark] | |
filter.actions[:exit] = [:filter] | |
info.actions[:enter] = [:mark] | |
info.actions[:exit] = [:info] | |
format.actions[:enter] = [:mark] | |
format.actions[:exit] = [:format] | |
genotype.actions[:enter] = [:mark] | |
genotype.actions[:exit] = [:genotype] | |
record.actions[:exit] = [:record] | |
return Automa.compile(vcf) | |
end)() | |
write("vcf.dot", Automa.dfa2dot(vcf_machine.dfa)) | |
run(`dot -Tsvg -o vcf.svg vcf.dot`) | |
# import Bio | |
import BufferedStreams: BufferedInputStream | |
type State{T<:BufferedInputStream} | |
stream::T # input stream | |
cs::Int # current DFA state of Ragel | |
linenum::Int # line number: parser is responsible for updating this | |
finished::Bool # true if finished (regardless of where in the stream we are) | |
end | |
function State(initstate::Int, input::BufferedInputStream) | |
return State(input, initstate, 1, false) | |
end | |
type VCFReader | |
state::State | |
metainfo::Vector{Pair{String,String}} | |
sampleIDs::Vector{String} | |
# temporary fields for parsing | |
offset::Int | |
id::Vector{UnitRange{Int}} | |
filter::Vector{UnitRange{Int}} | |
info::Vector{UnitRange{Int}} | |
format::Vector{UnitRange{Int}} | |
genotype::Vector{UnitRange{Int}} | |
function VCFReader(input::BufferedInputStream) | |
return new(State(vcf_machine.start_state, input), [], [], 0, [], [], [], [], []) | |
end | |
end | |
function add_metainfo!(reader::VCFReader, data, r::UnitRange{Int}) | |
lo, hi = first(r), last(r) | |
@assert data[lo] == data[lo+1] == UInt8('#') | |
pos = 0 | |
for i in lo+2:hi | |
if data[i] == UInt8('=') | |
pos = i | |
break | |
end | |
end | |
@assert pos > 0 | |
key = String(data[lo+2:pos-1]) | |
val = String(data[pos+1:hi]) | |
push!(reader.metainfo, key => val) | |
return reader | |
end | |
function add_sampleID!(reader::VCFReader, data, r::UnitRange{Int}) | |
push!(reader.sampleIDs, data[r]) | |
return reader | |
end | |
function init_record_cache!(reader::VCFReader) | |
reader.offset = 0 | |
empty!(reader.id) | |
empty!(reader.filter) | |
empty!(reader.info) | |
empty!(reader.format) | |
empty!(reader.genotype) | |
return reader | |
end | |
type VCFRecord | |
chrom::String | |
pos::Nullable{Int} | |
# data and ranges | |
data::Vector{UInt8} | |
id::Vector{UnitRange{Int}} | |
ref::UnitRange{Int} | |
alt::UnitRange{Int} | |
qual::Nullable{Float64} | |
filter::Vector{UnitRange{Int}} | |
info::Vector{UnitRange{Int}} | |
format::Vector{UnitRange{Int}} | |
genotype::Vector{UnitRange{Int}} | |
end | |
function VCFRecord() | |
return VCFRecord( | |
".", | |
Nullable{Int}(), | |
[], | |
[], | |
0:-1, | |
0:-1, | |
Nullable{Int}(), | |
[], | |
[], | |
[], | |
[]) | |
end | |
function try_parse_int64(data, r::UnitRange{Int}) | |
lo, hi = first(r), last(r) | |
if data[lo] == UInt8('.') | |
return Nullable{Int64}() | |
elseif data[lo] == UInt8('-') | |
sign = -1 | |
lo += 1 | |
else | |
sign = +1 | |
end | |
x = Int64(0) | |
for i in lo:hi | |
x = 10x + data[i] - UInt8('0') | |
end | |
return Nullable{Int64}(sign * x) | |
end | |
function try_parse_float64(data, r::UnitRange{Int}) | |
return ccall( | |
:jl_try_substrtod, | |
Nullable{Float64}, | |
(Ptr{UInt8}, Csize_t, Csize_t), | |
data, first(r) - 1, length(r)) | |
end | |
function set_chrom!(record, data, r) | |
if length(r) == sizeof(record.chrom) && memcmp(pointer(data, first(r)), pointer(record.chrom), length(r)) == 0 | |
return record | |
end | |
record.chrom = String(data[r]) | |
return record | |
end | |
function memcmp(p1, p2, n) | |
return ccall(:memcmp, Cint, (Ptr{Void}, Ptr{Void}, Csize_t), p1, p2, n) | |
end | |
@inline function anchor!(stream, p) | |
stream.anchor = p | |
end | |
@inline function upanchor!(stream) | |
@assert stream.anchor != 0 "upanchor! called with no anchor set" | |
anchor = stream.anchor | |
stream.anchor = 0 | |
return anchor | |
end | |
vcf_actions = Dict( | |
:count_line => :(linenum += 1), | |
:anchor => :(anchor!(stream, p)), | |
:mark => :(mark = p), | |
:metainfo => :(add_metainfo!(reader, data, upanchor!(stream):p-1)), | |
:sampleID => :(add_sampleID!(reader, data, upanchor!(stream):p-1)), | |
:chrom => :(init_record_cache!(reader); set_chrom!(record, data, upanchor!(stream):p-1)), | |
:pos => :(record.pos = try_parse_int64(data, upanchor!(stream):p-1); reader.offset = p), | |
:id => :(push!(reader.id, (mark:p-1)-reader.offset)), | |
:ref => :(record.ref = (mark:p-1)-reader.offset), | |
:alt => :(record.alt = (mark:p-1)-reader.offset), | |
:qual => :(record.qual = try_parse_float64(data, mark:p-1)), | |
:filter => :(push!(reader.filter, (mark:p-1)-reader.offset)), | |
:info => :(push!(reader.info, (mark:p-1)-reader.offset)), | |
:format => :(push!(reader.format, (mark:p-1)-reader.offset)), | |
:genotype => :(push!(reader.genotype, (mark:p-1)-reader.offset)), | |
:record => :(found_record = true; @escape)) | |
@eval function _read!(reader::VCFReader, state::State, record::VCFRecord) | |
stream = state.stream | |
cs = state.cs | |
linenum = state.linenum | |
p = stream.position | |
p_end = stream.available | |
p_eof = -1 | |
data = stream.buffer | |
found_record = false | |
mark = 0 | |
while true | |
$(Automa.generate_exec_code(vcf_machine, actions=vcf_actions, code=:goto)) | |
state.cs = cs | |
state.linenum = linenum | |
stream.position = p | |
if cs < 0 | |
error("VCF file format error on line ", linenum) | |
elseif cs == 0 | |
state.finished = true | |
else | |
if p > p_end | |
hits_eof = BufferedStreams.fillbuffer!(stream) == 0 | |
p = stream.position | |
p_end = stream.available | |
if hits_eof | |
p_eof = p_end | |
end | |
end | |
end | |
if found_record | |
resize_and_copy!(record.data, data, upanchor!(stream):p-1) | |
resize_and_copy!(record.id, reader.id) | |
resize_and_copy!(record.filter, reader.filter) | |
resize_and_copy!(record.info, reader.info) | |
resize_and_copy!(record.format, reader.format) | |
resize_and_copy!(record.genotype, reader.genotype) | |
break | |
elseif cs == 0 | |
throw(EOFError()) | |
end | |
end | |
return record | |
end | |
function resize_and_copy!(dst, src, r=1:endof(src)) | |
len = length(r) | |
if length(dst) != len | |
resize!(dst, len) | |
end | |
copy!(dst, 1, src, first(r), len) | |
return dst | |
end | |
# "/Users/kenta/bioinfo/data/gatk/CEUTrio.HiSeq.WGS.b37.NA12878.vcf" | |
# "/Users/kenta/vendor/seqan/demos/vcf_io/example.vcf" | |
let | |
reader = VCFReader(BufferedInputStream(open("1000G_omni2.5.b37.vcf"))) | |
record = VCFRecord() | |
# warm-up | |
_read!(reader, reader.state, record) | |
n = 0 | |
@time try | |
while true | |
_read!(reader, reader.state, record) | |
n += 1 | |
end | |
catch ex | |
if isa(ex, EOFError) | |
# pass | |
else | |
rethrow() | |
end | |
end | |
@show record | |
@show String(record.data) | |
@show n | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment