Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Created January 31, 2017 17:56
Show Gist options
  • Save bicycle1885/5080e87c9cb95fc2d6664864a9ef711f to your computer and use it in GitHub Desktop.
Save bicycle1885/5080e87c9cb95fc2d6664864a9ef711f to your computer and use it in GitHub Desktop.
import Automa
import Automa.RegExp: @re_str
import BufferedStreams
import BufferedStreams: BufferedInputStream
import Bio
const fastq_machine = (function ()
re = Automa.RegExp
cat = re.cat
rep = re.rep
rep1 = re.rep1
opt = re.opt
lf = re"\n"
lf.actions[:enter] = [:countline]
newline = cat(opt('\r'), lf)
hspace = re"[ \t\v]"
nonspace = re.any() \ re.space()
identifier1 = rep1(nonspace)
identifier1.actions[:enter] = [:mark]
identifier1.actions[:exit] = [:identifier1]
description1 = cat(nonspace, re"[^\r\n]*")
description1.actions[:enter] = [:mark]
description1.actions[:exit] = [:description1]
identifier2 = rep1(nonspace)
identifier2.actions[:enter] = [:mark]
identifier2.actions[:exit] = [:identifier2]
description2 = cat(nonspace, re"[^\r\n]*")
description2.actions[:enter] = [:mark]
description2.actions[:exit] = [:description2]
sequence = let
letters = re"[A-z]+"
letters.actions[:enter] = [:mark]
letters.actions[:exit] = [:letters]
cat(opt(letters), rep(cat(rep1(newline), letters)))
end
sequence.actions[:enter] = [:mark2]
sequence.actions[:exit] = [:sequence]
quality = let
qletters = re"[!-~]+"
qletters.actions[:enter] = [:mark]
qletters.actions[:exit] = [:letters]
cat(opt(qletters), rep(cat(rep1(newline), qletters)))
end
quality.actions[:enter] = [:mark2]
quality.actions[:exit] = [:quality]
record = cat(
'@', identifier1, opt(cat(rep1(hspace), description1)), newline,
sequence, rep1(newline),
'+', opt(cat(identifier2, opt(cat(rep1(hspace), description2)))), newline,
quality, rep1(newline))
record.actions[:enter] = [:anchor]
record.actions[:exit] = [:record]
whitespace = re.space() | newline
fastq = cat(rep(whitespace), rep(record))
return Automa.compile(fastq)
end)()
#= Debug =#
write("fastq.dot", Automa.dfa2dot(fastq_machine.dfa))
run(`dot -Tsvg -o fastq.svg fastq.dot`)
type FASTQReader <: Bio.IO.AbstractReader
state::Bio.Ragel.State
function FASTQReader(input::BufferedInputStream)
return new(Bio.Ragel.State(fastq_machine.start_state, input))
end
end
function FASTQReader(input::IO)
return FASTQReader(BufferedInputStream(input))
end
type FASTQRecord
filled::Bool
data::Vector{UInt8}
identifier1::UnitRange{Int}
description1::UnitRange{Int}
sequence::UnitRange{Int}
identifier2::UnitRange{Int}
description2::UnitRange{Int}
quality::UnitRange{Int}
end
function FASTQRecord(data::Vector{UInt8}=UInt8[])
record = FASTQRecord(
false,
data,
0:-1,
0:-1,
0:-1,
0:-1,
0:-1,
0:-1)
if !isempty(data)
# TODO
index!(record)
end
return record
end
function initialize!(record::FASTQRecord)
record.filled = false
record.identifier1 = 0:-1
record.description1 = 0:-1
record.sequence = 0:-1
record.identifier2 = 0:-1
record.description2 = 0:-1
record.quality = 0:-1
return record
end
function Base.show(io::IO, record::FASTQRecord)
print(io, summary(record), ':')
if record.filled
println(io)
println(io, " identifier: ", String(record.data[record.identifier1]))
println(io, " description: ", String(record.data[record.description1]))
println(io, " sequence: ", String(record.data[record.sequence]))
print(io, " quality: ", String(record.data[record.quality]))
else
print(io, " <not filled>")
end
end
function Base.read!(reader::FASTQReader, record::FASTQRecord)
return _read!(reader, reader.state, record)
end
@inline function anchor!(stream::BufferedInputStream, p)
stream.anchor = p
stream.immobilized = true
return stream
end
@inline function upanchor!(stream::BufferedInputStream)
@assert stream.anchor != 0 "upanchor! called with no anchor set"
anchor = stream.anchor
stream.anchor = 0
stream.immobilized = false
return anchor
end
const fastq_actions = Dict(
:identifier1 => :(record.identifier1 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)),
:description1 => :(record.description1 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)),
:identifier2 => :(record.identifier2 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)),
:description2 => :(record.description2 = (1:p-mark)+copied; copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)),
:letters => :(copied += resize_and_copy!(record.data, copied + 1, data, mark:p-1)),
:sequence => :(record.sequence = mark2:copied),
:quality => :(record.quality = mark2:copied),
:record => :(found_record = true; @escape),
:countline => :(linenum += 1),
:anchor => :(anchor!(stream, p)),
:mark => :(mark = p),
:mark2 => :(mark2 = copied + 1))
function resize_and_copy!(dst::Vector{UInt8}, dstart::Int, src::Vector{UInt8}, range::UnitRange{Int})
n = length(range)
if length(dst) < dstart + n - 1
resize!(dst, dstart + n - 1)
end
copy!(dst, dstart, src, first(range), n)
return n
end
@eval function _read!(reader::FASTQReader, state::Bio.Ragel.State, record::FASTQRecord)
stream = state.stream
if stream.position // length(stream.buffer) > 95 // 100
BufferedStreams.shiftdata!(stream)
end
cs = state.cs
linenum = state.linenum
data = stream.buffer
p = stream.position
p_end = stream.available
p_eof = -1
mark = mark2 = copied = 0
found_record = false
initialize!(record)
anchor!(stream, p)
while true
$(Automa.generate_exec_code(fastq_machine, actions=fastq_actions, code=:goto, check=false))
state.cs = cs
state.finished = cs == 0
state.linenum = linenum
stream.position = p
if cs < 0
error("FASTQ file format error on line ", linenum)
elseif found_record
upanchor!(stream)
record.filled = true
checkrecord(record)
break
elseif cs == 0
throw(EOFError())
elseif p > p_eof 0
error("incomplete FASTQ input on line ", linenum)
else
hits_eof = BufferedStreams.fillbuffer!(stream) == 0
p = stream.position
p_end = stream.available
if hits_eof
p_eof = p_end
end
end
end
return record
end
function checkrecord(record::FASTQRecord)
# TODO
end
function scan1(filepath)
reader = open(FASTQReader, filepath)
record = FASTQRecord()
try
while true
read!(reader, record)
#String(record.data[record.sequence])
end
catch ex
if !isa(ex, EOFError)
rethrow()
end
end
end
function scan2(filepath)
reader = open(Bio.Seq.FASTQReader, filepath)
record = Bio.Seq.FASTQSeqRecord{Bio.Seq.DNASequence}()
try
while true
read!(reader, record)
end
catch ex
if !isa(ex, EOFError)
rethrow()
end
end
end
scan1("./test/BioFmtSpecimens/FASTQ/example.fastq")
@time scan1("test.fastq")
@time scan1("test.fastq")
@time scan1("test.fastq")
scan2("./test/BioFmtSpecimens/FASTQ/example.fastq")
@time scan2("test.fastq")
@time scan2("test.fastq")
@time scan2("test.fastq")
#=
~/.j/v/Bio (vcf|…) $ julia fastq.jl
1.263985 seconds (159 allocations: 138.297 KB)
1.280454 seconds (33 allocations: 130.734 KB)
1.313651 seconds (33 allocations: 130.734 KB)
6.440958 seconds (59 allocations: 4.006 MB)
6.491868 seconds (59 allocations: 4.006 MB)
7.357701 seconds (59 allocations: 4.006 MB)
=#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment