Skip to content

Instantly share code, notes, and snippets.

@bicycle1885
Last active January 10, 2017 19:14
Show Gist options
  • Save bicycle1885/39e26a9810a0bf39601925352dc12281 to your computer and use it in GitHub Desktop.
Save bicycle1885/39e26a9810a0bf39601925352dc12281 to your computer and use it in GitHub Desktop.
Simple NEWICK tokenizer using Automa.jl
using Automa
using Automa.RegExp
const re = Automa.RegExp
# Compiled tokenizer for NEWICK tree strings.
#
# The token patterns are bound inside a `let` block under names that do not
# clash with Base: a global called `length` would shadow `Base.length` for the
# rest of the module and break every later `length(x)` call. The patterns are
# only needed to build `newick`, so nothing else is left behind as a global.
const newick = let
    # Branch length: digits, a literal dot, then digits (e.g. "6.0").
    branchlength = re"[0-9]+\.[0-9]+"
    # Node name: one or more characters from the range '!'-'~', excluding the
    # punctuation characters `,:;()[]`.
    nodename = re.rep1(re"[!-~]" \ re"[,:;()[\]]")
    # Run of one or more space characters.
    spaces = re" +"

    # Each pattern is paired with the action expression to run on a match.
    # `emit` is not defined here; the function that splices in the generated
    # code has to provide it.
    compile(
        re"\(" => :(emit(:lparen)),
        re"\)" => :(emit(:rparen)),
        re"," => :(emit(:comma)),
        re":" => :(emit(:colon)),
        re";" => :(emit(:semicolon)),
        branchlength => :(emit(:length)),
        nodename => :(emit(:name)),
        spaces => :(),  # no `emit` call: spaces produce no token
    )
end
# tokenize(data)
#
# Run the `newick` tokenizer over `data` and return the emitted tokens as a
# `Vector{Tuple{Symbol,String}}` of `(kind, text)` pairs, where `text` is
# `data[ts:te]`. Throws an error ("failed to tokenize") when the machine ends
# in a negative state.
#
# The function is defined through `@eval` so that the code produced by
# `generate_init_code` / `generate_exec_code` is spliced into its body.
# NOTE(review): `p`, `cs`, `ts` and `te` are not assigned by hand here, so they
# presumably come from the generated code -- confirm against the Automa version
# in use. `data` is presumably a `String`, given the `String` element type.
@eval function tokenize(data)
    $(generate_init_code(newick))

    # The whole input is available up front, so the end of the buffer and the
    # end of the input are the same byte offset.
    p_eof = sizeof(data)
    p_end = p_eof

    collected = Tuple{Symbol,String}[]

    # Invoked by the token actions compiled into `newick`.
    function emit(kind)
        return push!(collected, (kind, data[ts:te]))
    end

    # Keep running the generated code while the state is positive and there is
    # input left.
    while cs > 0 && p ≤ p_eof
        $(generate_exec_code(newick))
    end

    cs < 0 && error("failed to tokenize")
    return collected
end
# Example trees taken from:
# http://evolution.genetics.washington.edu/phylip/newicktree.html
# Each one is tokenized in turn, in the order listed; the returned token
# vectors are not stored.
for tree in [
    "(,(,,),);",
    "(B:6.0,(A:5.0,C:3.0,E:4.0):5.0,D:11.0);",
    "(B:6.0,(A:5.0,C:3.0,E:4.0)Ancestor1:5.0,D:11.0);",
    "((raccoon:19.19959,bear:6.80041):0.84600,((sea_lion:11.99700, seal:12.00300):7.52973,((monkey:100.85930,cat:47.14069):20.59201, weasel:18.87953):2.09460):3.87382,dog:25.46154);",
    "(Bovine:0.69395,(Gibbon:0.36079,(Orang:0.33636,(Gorilla:0.17147,(Chimp:0.19268, Human:0.11927):0.08386):0.06124):0.15057):0.54939,Mouse:1.21460):0.10;",
    "(Bovine:0.69395,(Hylobates:0.36079,(Pongo:0.33636,(G._Gorilla:0.17147, (P._paniscus:0.19268,H._sapiens:0.11927):0.08386):0.06124):0.15057):0.54939, Rodent:1.21460);",
    "A;",
    "((A,B),(C,D));",
    "(Alpha,Beta,Gamma,Delta,,Epsilon,,,);",
]
    tokenize(tree)
end
#= Example
julia> tokenize("(B:6.0,(A:5.0,C:3.0,E:4.0)Ancestor1:5.0,D:11.0);")
27-element Array{Tuple{Symbol,String},1}:
(:lparen,"(")
(:name,"B")
(:colon,":")
(:length,"6.0")
(:comma,",")
(:lparen,"(")
(:name,"A")
(:colon,":")
(:length,"5.0")
(:comma,",")
(:name,"C")
(:colon,":")
(:length,"3.0")
(:comma,",")
(:name,"E")
(:colon,":")
(:length,"4.0")
(:rparen,")")
(:name,"Ancestor1")
(:colon,":")
(:length,"5.0")
(:comma,",")
(:name,"D")
(:colon,":")
(:length,"11.0")
(:rparen,")")
(:semicolon,";")
=#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment