Created
January 31, 2017 17:56
-
-
Save bicycle1885/5080e87c9cb95fc2d6664864a9ef711f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Automa | |
import Automa.RegExp: @re_str | |
import BufferedStreams | |
import BufferedStreams: BufferedInputStream | |
import Bio | |
const fastq_machine = (function () | |
re = Automa.RegExp | |
cat = re.cat | |
rep = re.rep | |
rep1 = re.rep1 | |
opt = re.opt | |
lf = re"\n" | |
lf.actions[:enter] = [:countline] | |
newline = cat(opt('\r'), lf) | |
hspace = re"[ \t\v]" | |
nonspace = re.any() \ re.space() | |
identifier1 = rep1(nonspace) | |
identifier1.actions[:enter] = [:mark] | |
identifier1.actions[:exit] = [:identifier1] | |
description1 = cat(nonspace, re"[^\r\n]*") | |
description1.actions[:enter] = [:mark] | |
description1.actions[:exit] = [:description1] | |
identifier2 = rep1(nonspace) | |
identifier2.actions[:enter] = [:mark] | |
identifier2.actions[:exit] = [:identifier2] | |
description2 = cat(nonspace, re"[^\r\n]*") | |
description2.actions[:enter] = [:mark] | |
description2.actions[:exit] = [:description2] | |
sequence = let | |
letters = re"[A-z]+" | |
letters.actions[:enter] = [:mark] | |
letters.actions[:exit] = [:letters] | |
cat(opt(letters), rep(cat(rep1(newline), letters))) | |
end | |
sequence.actions[:enter] = [:mark2] | |
sequence.actions[:exit] = [:sequence] | |
quality = let | |
qletters = re"[!-~]+" | |
qletters.actions[:enter] = [:mark] | |
qletters.actions[:exit] = [:letters] | |
cat(opt(qletters), rep(cat(rep1(newline), qletters))) | |
end | |
quality.actions[:enter] = [:mark2] | |
quality.actions[:exit] = [:quality] | |
record = cat( | |
'@', identifier1, opt(cat(rep1(hspace), description1)), newline, | |
sequence, rep1(newline), | |
'+', opt(cat(identifier2, opt(cat(rep1(hspace), description2)))), newline, | |
quality, rep1(newline)) | |
record.actions[:enter] = [:anchor] | |
record.actions[:exit] = [:record] | |
whitespace = re.space() | newline | |
fastq = cat(rep(whitespace), rep(record)) | |
return Automa.compile(fastq) | |
end)() | |
#= Debug =# | |
write("fastq.dot", Automa.dfa2dot(fastq_machine.dfa)) | |
run(`dot -Tsvg -o fastq.svg fastq.dot`) | |
type FASTQReader <: Bio.IO.AbstractReader | |
state::Bio.Ragel.State | |
function FASTQReader(input::BufferedInputStream) | |
return new(Bio.Ragel.State(fastq_machine.start_state, input)) | |
end | |
end | |
function FASTQReader(input::IO) | |
return FASTQReader(BufferedInputStream(input)) | |
end | |
type FASTQRecord | |
filled::Bool | |
data::Vector{UInt8} | |
identifier1::UnitRange{Int} | |
description1::UnitRange{Int} | |
sequence::UnitRange{Int} | |
identifier2::UnitRange{Int} | |
description2::UnitRange{Int} | |
quality::UnitRange{Int} | |
end | |
function FASTQRecord(data::Vector{UInt8}=UInt8[]) | |
record = FASTQRecord( | |
false, | |
data, | |
0:-1, | |
0:-1, | |
0:-1, | |
0:-1, | |
0:-1, | |
0:-1) | |
if !isempty(data) | |
# TODO | |
index!(record) | |
end | |
return record | |
end | |
function initialize!(record::FASTQRecord) | |
record.filled = false | |
record.identifier1 = 0:-1 | |
record.description1 = 0:-1 | |
record.sequence = 0:-1 | |
record.identifier2 = 0:-1 | |
record.description2 = 0:-1 | |
record.quality = 0:-1 | |
return record | |
end | |
function Base.show(io::IO, record::FASTQRecord) | |
print(io, summary(record), ':') | |
if record.filled | |
println(io) | |
println(io, " identifier: ", String(record.data[record.identifier1])) | |
println(io, " description: ", String(record.data[record.description1])) | |
println(io, " sequence: ", String(record.data[record.sequence])) | |
print(io, " quality: ", String(record.data[record.quality])) | |
else | |
print(io, " <not filled>") | |
end | |
end | |
function Base.read!(reader::FASTQReader, record::FASTQRecord) | |
return _read!(reader, reader.state, record) | |
end | |
@inline function anchor!(stream::BufferedInputStream, p) | |
stream.anchor = p | |
stream.immobilized = true | |
return stream | |
end | |
@inline function upanchor!(stream::BufferedInputStream) | |
@assert stream.anchor != 0 "upanchor! called with no anchor set" | |
anchor = stream.anchor | |
stream.anchor = 0 | |
stream.immobilized = false | |
return anchor | |
end | |
const fastq_actions = Dict( | |
:identifier1 => :(record.identifier1 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)), | |
:description1 => :(record.description1 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)), | |
:identifier2 => :(record.identifier2 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)), | |
:description2 => :(record.description2 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)), | |
:letters => :(copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)), | |
:sequence => :(record.sequence = mark2:copied), | |
:quality => :(record.quality = mark2:copied), | |
:record => :(found_record = true; @escape), | |
:countline => :(linenum += 1), | |
:anchor => :(anchor!(stream, p)), | |
:mark => :(mark = p), | |
:mark2 => :(mark2 = copied + 1)) | |
function resize_and_copy!(dst::Vector{UInt8}, dstart::Int, src::Vector{UInt8}, range::UnitRange{Int}) | |
n = length(range) | |
if length(dst) < dstart + n - 1 | |
resize!(dst, dstart + n - 1) | |
end | |
copy!(dst, dstart, src, first(range), n) | |
return n | |
end | |
@eval function _read!(reader::FASTQReader, state::Bio.Ragel.State, record::FASTQRecord) | |
stream = state.stream | |
if stream.position // length(stream.buffer) > 95 // 100 | |
BufferedStreams.shiftdata!(stream) | |
end | |
cs = state.cs | |
linenum = state.linenum | |
data = stream.buffer | |
p = stream.position | |
p_end = stream.available | |
p_eof = -1 | |
mark = mark2 = copied = 0 | |
found_record = false | |
initialize!(record) | |
anchor!(stream, p) | |
while true | |
$(Automa.generate_exec_code(fastq_machine, actions=fastq_actions, code=:goto, check=false)) | |
state.cs = cs | |
state.finished = cs == 0 | |
state.linenum = linenum | |
stream.position = p | |
if cs < 0 | |
error("FASTQ file format error on line ", linenum) | |
elseif found_record | |
upanchor!(stream) | |
record.filled = true | |
checkrecord(record) | |
break | |
elseif cs == 0 | |
throw(EOFError()) | |
elseif p > p_eof ≥ 0 | |
error("incomplete FASTQ input on line ", linenum) | |
else | |
hits_eof = BufferedStreams.fillbuffer!(stream) == 0 | |
p = stream.position | |
p_end = stream.available | |
if hits_eof | |
p_eof = p_end | |
end | |
end | |
end | |
return record | |
end | |
function checkrecord(record::FASTQRecord) | |
# TODO | |
end | |
function scan1(filepath) | |
reader = open(FASTQReader, filepath) | |
record = FASTQRecord() | |
try | |
while true | |
read!(reader, record) | |
#String(record.data[record.sequence]) | |
end | |
catch ex | |
if !isa(ex, EOFError) | |
rethrow() | |
end | |
end | |
end | |
function scan2(filepath) | |
reader = open(Bio.Seq.FASTQReader, filepath) | |
record = Bio.Seq.FASTQSeqRecord{Bio.Seq.DNASequence}() | |
try | |
while true | |
read!(reader, record) | |
end | |
catch ex | |
if !isa(ex, EOFError) | |
rethrow() | |
end | |
end | |
end | |
scan1("./test/BioFmtSpecimens/FASTQ/example.fastq") | |
@time scan1("test.fastq") | |
@time scan1("test.fastq") | |
@time scan1("test.fastq") | |
scan2("./test/BioFmtSpecimens/FASTQ/example.fastq") | |
@time scan2("test.fastq") | |
@time scan2("test.fastq") | |
@time scan2("test.fastq") | |
#= | |
~/.j/v/Bio (vcf|…) $ julia fastq.jl | |
1.263985 seconds (159 allocations: 138.297 KB) | |
1.280454 seconds (33 allocations: 130.734 KB) | |
1.313651 seconds (33 allocations: 130.734 KB) | |
6.440958 seconds (59 allocations: 4.006 MB) | |
6.491868 seconds (59 allocations: 4.006 MB) | |
7.357701 seconds (59 allocations: 4.006 MB) | |
=# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment