Last active
October 12, 2019 21:18
-
-
Save peteroupc/177d89b387b609daaa44e0d498f46181 to your computer and use it in GitHub Desktop.
ABNF parser and reader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class TokenSymbolParser | |
def initialize(token,prod) | |
@token=token | |
@prod=prod | |
end | |
def parse(str) | |
return _parseOne(@token,str,0,str.length) | |
end | |
private | |
def _parseOne(token,str,index,endIndex) | |
minOccurs=[token.minOccurs,0].max | |
maxOccurs=token.maxOccurs | |
count=0 | |
while maxOccurs<0 || count<maxOccurs | |
idx=_parseOneInst(token,str,index,endIndex) | |
if idx<0 | |
return -1 if count<minOccurs | |
return index | |
end | |
count+=1 | |
index=idx | |
end | |
return index | |
end | |
def _parseOneInst(token,str,index,endIndex) | |
case token.kind | |
when :HEX | |
return index<endIndex && str[index].ord==token.value ? index+1 : -1 | |
when :HEXRANGE | |
if index<endIndex && str[index].ord>=token.value[0] && str[index].ord<=token.value[1] | |
return index+1 | |
end | |
return -1 | |
when :WORD | |
return _parseOne(@prod[token.value],str,index,endIndex) | |
when :STRING | |
token.value.each_char{|x| | |
return -1 if index>=endIndex | |
iord=str[index].ord | |
xord=x[0].ord | |
index+=1 | |
if (xord>=0x41 && xord<=0x5a) | |
return -1 if iord!=xord && iord!=xord+0x20 | |
elsif (xord>=0x61 && xord<=0x7a) | |
return -1 if iord!=xord && iord!=xord-0x20 | |
else | |
return -1 if iord!=xord | |
end | |
} | |
return index | |
when :SEQUENCE | |
token.each{|tok| | |
idx=_parseOne(tok,str,index,endIndex) | |
return -1 if idx<0 | |
index=idx | |
} | |
return index | |
when :CHOICE | |
token.each{|tok| | |
idx=_parseOne(tok,str,index,endIndex) | |
return idx if idx>=0 | |
} | |
return -1 | |
end | |
end | |
end | |
# | |
# Represents a single node in an ABNF grammar. | |
# | |
class TokenSymbol | |
include Enumerable | |
attr_accessor :kind | |
attr_accessor :value | |
attr_reader :tokens | |
attr_accessor :parent | |
attr_accessor :minOccurs | |
attr_accessor :name | |
attr_accessor :maxOccurs | |
def initialize(kind=nil,value=nil) | |
@tokens=[] | |
@kind=kind | |
@value=value | |
@minOccurs=1 | |
@maxOccurs=1 | |
end | |
def each | |
@tokens.each {|item| yield item } | |
end | |
def isDelegate? | |
return true if self.kind==:SEQUENCE && | |
self.minOccurs==1 && self.maxOccurs==1 && | |
@tokens.length==1 && @tokens[0].kind==:WORD && | |
@tokens[0].minOccurs==1 && @tokens[0].maxOccurs==1 | |
return true if self.kind==:WORD && | |
self.minOccurs==1 && self.maxOccurs==1 | |
return false | |
end | |
def selfOrLast | |
if @kind==:CHOICE | |
return @tokens[@tokens.length-1] | |
else | |
return self | |
end | |
end | |
def length | |
return @tokens.length | |
end | |
def [](idx) | |
return @tokens[idx] | |
end | |
def minOccurs=(val) | |
val=-1 if val<=0 | |
@minOccurs=val | |
end | |
def maxOccurs=(val) | |
val=-1 if val<=0 | |
@maxOccurs=val | |
end | |
def setMinMax(mn,mx) | |
self.minOccurs=mn | |
self.maxOccurs=mx | |
return self | |
end | |
def clear | |
@tokens.clear | |
end | |
def copy | |
ret=TokenSymbol.new(@kind,@value) | |
ret.minOccurs=@minOccurs | |
ret.maxOccurs=@maxOccurs | |
for tok in @tokens | |
ret.addRaw(tok) | |
end | |
return ret | |
end | |
def add(tok) | |
sym=TokenSymbol.new(tok[0],tok[1]) | |
sym.parent=self | |
@tokens.push(sym) | |
return sym | |
end | |
def addChild(child) | |
child.parent=self | |
@tokens.push(child) | |
return child | |
end | |
def addRaw(child) | |
@tokens.push(child) | |
return child | |
end | |
def insertChild(child,index) | |
child.parent=self | |
@tokens.insert(index,child) | |
return child | |
end | |
def inspectSimple | |
return "#{kind} #{value||name} #{minOccurs} #{maxOccurs}" | |
end | |
def inspectDepth(depth) | |
ret="" | |
for i in 0...depth | |
ret+=" " | |
end | |
ret+="#{kind} #{value||name} #{minOccurs} #{maxOccurs}\n" | |
for tok in @tokens | |
ret+=(tok ? tok.inspectDepth(depth+1) : (" "*(depth+1))+"?\n") | |
end | |
return ret | |
end | |
def toNamedStr | |
ret=to_s | |
ret2=ret.gsub(/^\(|\)$/,"") | |
ret=ret2 if !ret2.index("(") && !ret2.index(")") | |
return ret if !name | |
return name+" = "+ret | |
end | |
def minLength | |
return 0 if self.minOccurs<=0 | |
case @kind | |
when :STRING, :HEX, :HEXRANGE | |
return self.hexLength*self.minOccurs | |
when :SEQUENCE | |
if @tokens.length==1 | |
return @tokens[0].minLength*self.minOccurs | |
end | |
return self.choiceLength | |
when :CHOICE | |
if @tokens.length==1 | |
return @tokens[0].minLength*self.minOccurs | |
end | |
return self.choiceLength | |
when :WORD | |
raise "Not supported" | |
end | |
return 0 | |
end | |
def to_s | |
extra="" | |
ret="" | |
optional=false | |
if minOccurs<=0 | |
if maxOccurs==1 | |
optional=true | |
else | |
extra="*"+(maxOccurs<0 ? "" : maxOccurs.to_s) | |
end | |
elsif maxOccurs<=0 | |
extra=minOccurs.to_s+"*" | |
elsif minOccurs==0 && maxOccurs==1 | |
optional=true | |
elsif minOccurs!=maxOccurs | |
extra=minOccurs.to_s+"*"+maxOccurs.to_s | |
elsif minOccurs!=1 | |
extra=minOccurs.to_s | |
end | |
case @kind | |
when :STRING | |
if value[/[\r\n]/] | |
rr=[] | |
for b in value.bytes | |
rr.push(sprintf("%%x%02x",b)) | |
end | |
return rr.join(" ") | |
else | |
data=value.split("\"") | |
for i in 0...data.length | |
data[i]="\""+data[i]+"\"" | |
end | |
ret=data.join(" %x22 ") | |
end | |
when :WORD | |
ret=@value | |
when :HEXRANGE | |
ret=sprintf("%%x%02x-%02x",@value[0],@value[1]) | |
when :HEX | |
ret=sprintf("%%x%02x",@value) | |
when :SEQUENCE | |
ret=((optional || @tokens.length==1) ? "" : "(")[email protected]{|t| t.to_s }.join(" ")+ | |
((optional || @tokens.length==1) ? "" : ")") | |
when :CHOICE | |
ret=((optional || @tokens.length==1) ? "" : "(")[email protected]{|t| t.to_s }.join(" / ")+ | |
((optional || @tokens.length==1) ? "" : ")") | |
end | |
if optional | |
return "["+ret+"]" | |
else | |
return extra+ret | |
end | |
end | |
private | |
def _appendUtf8(str,codepoint) | |
return str if codepoint<0 | |
if codepoint<=0x7F | |
str+=codepoint.chr | |
elsif codepoint<=0x7FF | |
str+=(0xC0|((codepoint>>6)&0x1F)).chr | |
str+=(0x80|(codepoint &0x3F)).chr | |
elsif codepoint<=0xFFFF | |
str+=(0xE0|((codepoint>>12)&0x0F)).chr | |
str+=(0x80|((codepoint>>6)&0x3F)).chr | |
str+=(0x80|(codepoint &0x3F)).chr | |
elsif codepoint<=0x10FFFF | |
str+=(0xF0|((codepoint>>18)&0x07)).chr | |
str+=(0x80|((codepoint>>12)&0x3F)).chr | |
str+=(0x80|((codepoint>>6)&0x3F)).chr | |
str+=(0x80|(codepoint &0x3F)).chr | |
end | |
return str | |
end | |
public | |
# Length of a string token | |
def stringLength | |
return (self.kind==:STRING) ? self.value.length : 0 | |
end | |
# Minimum length of a hex token | |
def hexLength | |
return self.value.length if self.kind==:STRING | |
return self.minOccurs if (self.kind==:HEX || self.kind==:HEXRANGE) | |
return 0 | |
end | |
# Recursively calculates the minimum length of a choice token | |
def choiceLength | |
if self.kind==:CHOICE | |
return self.map{|tok| tok.hexLength()*self.minOccurs }.min | |
end | |
if self.kind==:SEQUENCE && self.length==2 | |
h=self[0].hexLength() | |
if self[1].kind==:HEX || self[1].kind==:HEXRANGE || self[1].kind==:STRING | |
h+=self[1].hexLength() | |
end | |
if self[1].kind==:CHOICE | |
h+=self[1].choiceLength() | |
end | |
return h*self.minOccurs | |
end | |
return 0 | |
end | |
def isNonWhitespace? | |
if self.kind==:HEX | |
return @value!=0x20 && @value!=0x09 && | |
@value!=0x0a && @value!=0x0d | |
elsif self.kind==:STRING | |
for b in @value.bytes | |
return true if b!=0x20 && b!=0x09 && | |
b!=0x0a && b!=0x0d | |
end | |
return false | |
elsif self.kind==:HEXRANGE | |
return true if @value[0]>0x20 | |
chars=[] | |
c=@value[0];while c<=[0x21,@value[1]].min | |
chars.push(c) | |
c+=1;end | |
chars.delete(0x20) | |
chars.delete(0x09) | |
chars.delete(0x0a) | |
chars.delete(0x0d) | |
return chars.length>0 | |
end | |
return false | |
end | |
def productionContainsWhitespaceOnly?(prod) | |
s=self | |
while s.kind==:WORD | |
raise "Can't find production '#{s.value}'" if !prod[s.value] | |
s=prod[s.value] | |
end | |
if s.kind==:CHOICE || s.kind==:SEQUENCE | |
s.each{|t| | |
return false if !t.productionContainsWhitespaceOnly?(prod) | |
} | |
return true | |
end | |
return !s.isNonWhitespace? | |
end | |
def isMatch?(str,productions) | |
parser=TokenSymbolParser.new(self,productions) | |
return parser.parse(str)==str.length | |
end | |
# | |
# Generates a random UTF-8 string that fits this node's grammar. | |
# The 'productions' parameter is a hash of productions (nodes) to be used if | |
# this node refers to another ABNF production (node) by name, either | |
# directly or indirectly. | |
# | |
def randomString(productions, stack=nil) | |
isws=self.productionContainsWhitespaceOnly?(productions) | |
mn=@minOccurs<=0 ? 0 : @minOccurs | |
mx=@maxOccurs<0 ? ((1+rand(12))+(1+rand(12)))/2 : @maxOccurs | |
if mn<=0 && mx==1 && rand(100)<(isws ? 90 : 10) | |
# Optional production | |
return "" | |
end | |
if mn<=0 && mx > 0 | |
mn=1 | |
end | |
stack=[] if !stack | |
occur=(mx==mn) ? mn : mn+rand(mx-mn+1) | |
if isws | |
occur/=2 | |
occur=mn if occur<mn | |
end | |
str="" | |
if mn==0 && isws && rand(100)<80 | |
occur=0 | |
end | |
if mn==1 && isws && rand(100)<80 | |
occur=0 | |
end | |
for i in 0...occur | |
case @kind | |
when :STRING | |
for b in self.value.bytes | |
if (b>=0x41 && b<=0x5a) || (b>=0x61 && b<=0x7a) | |
newletter=rand(2)==0 ? (b|0x20) : (b&0xDF) | |
raise if !((newletter>=0x41 && newletter<=0x5a) || | |
(newletter>=0x61 && newletter<=0x7a)) | |
str=_appendUtf8(str,newletter); | |
else | |
str=_appendUtf8(str,b) | |
end | |
end | |
when :WORD | |
stack.push(@value) | |
pr=productions[@value] | |
raise "Can't find production '#{@value}'" if !pr | |
str+=pr.randomString(productions, stack) | |
stack.pop | |
when :HEX | |
str=_appendUtf8(str,@value) | |
when :HEXRANGE | |
includesUpperLowerA=@value[0]<=0x41 && @value[1]>=0x41 && | |
@value[0]<=0x61 && @value[1]>=0x61 | |
rnd=0 | |
while true | |
rnd=@value[0]==@value[1] ? @value[0] : | |
@value[0]+rand(@value[1]-@value[0]+1) | |
if includesUpperLowerA && !(rnd>=0x41 && rnd<=0x5a) && | |
!(rnd>=0x61 && rnd<=0x7a) && | |
!(rnd>=0x30 && rnd<=0x39) && | |
rand(100)<90 | |
next | |
end | |
if rnd<0x20 && rand(100)<90 | |
next | |
end | |
break | |
end | |
str+=_appendUtf8(str,rnd) | |
when :SEQUENCE | |
self.each{|t| | |
str+=t.randomString(productions, stack) | |
} | |
when :CHOICE | |
while true | |
t=@tokens[rand(@tokens.length)] | |
if t.productionContainsWhitespaceOnly?(productions) && | |
rand(100)<95 | |
next | |
end | |
if t.kind==:WORD | |
recursing=stack.include?(t.value) | |
if recursing && (stack.length>50 || rand(100)<90) | |
next | |
end | |
end | |
str+=t.randomString(productions, stack) | |
break | |
end | |
end | |
end | |
return str | |
end | |
def allHexChoice | |
return false if @kind!=:CHOICE | |
for tok in @tokens | |
return false if tok.kind!=:HEX && tok.kind!=:HEXRANGE | |
end | |
return true | |
end | |
def isAsciiLetter(x) | |
return (x>=0x41 && x<=0x5a) || (x>=0x61 && x<=0x7a) | |
end | |
def allHexChoiceRecursiveAnyOccurs | |
return false if @kind!=:CHOICE | |
for tok in @tokens | |
return false if tok.kind!=:HEX && tok.kind!=:HEXRANGE && !tok.allHexChoiceRecursiveAnyOccurs() | |
end | |
return true | |
end | |
def _hexChoiceTokens(hextokens) | |
raise "Internal error" if @kind!=:CHOICE | |
for tok in @tokens | |
raise "Internal error" if tok.minOccurs!=1 || tok.maxOccurs!=1 | |
if tok.kind==:CHOICE | |
tok._hexChoiceTokens(hextokens) | |
elsif tok.kind==:HEX || tok.kind==:HEXRANGE | |
hextokens.push(tok) | |
else | |
raise "Internal error" | |
end | |
end | |
end | |
def allHexChoiceRecursive | |
return false if @kind!=:CHOICE | |
for tok in @tokens | |
return false if tok.minOccurs!=1 || tok.maxOccurs!=1 | |
return false if tok.kind!=:HEX && tok.kind!=:HEXRANGE && !tok.allHexChoiceRecursive() | |
end | |
return true | |
end | |
# Simplifies an ABNF tree | |
def simplify | |
return false if @kind==:HEX ||@kind==:HEXRANGE | |
changed=true | |
iter=0 | |
while changed | |
changed=false | |
iter+=1; raise self.inspect if iter>10 | |
if @kind==:SEQUENCE && @tokens.length>1 | |
i=0;while i<@tokens.length | |
tok=@tokens[i] | |
if tok.kind==:SEQUENCE && tok.minOccurs==1 && tok.maxOccurs==1 | |
newtokens=@tokens[0,i] | |
endtokens=@tokens[i+1,@tokens.length] | |
self.clear | |
for t in newtokens;self.addChild(t);end | |
for j in 0...tok.length; self.addChild(tok[j]); end | |
for t in endtokens;self.addChild(t);end | |
i-=1 | |
changed=true | |
end | |
i+=1 | |
end | |
end | |
if @kind==:SEQUENCE && @tokens.length==1 | |
tok=@tokens[0] | |
if (tok.kind==:WORD || tok.kind==:STRING || tok.kind==:HEX || tok.kind==:HEXRANGE) && | |
((tok.minOccurs==1 && tok.maxOccurs==1) || (self.minOccurs==1 && self.maxOccurs==1)) | |
self.kind=tok.kind | |
self.value=tok.value | |
if (self.minOccurs==1 && self.maxOccurs==1) | |
self.minOccurs=tok.minOccurs | |
self.maxOccurs=tok.maxOccurs | |
end | |
self.name=tok.name | |
self.clear | |
changed=true | |
elsif (tok.kind==:CHOICE || tok.kind==:SEQUENCE) && | |
((tok.minOccurs==1 && tok.maxOccurs==1) || (self.minOccurs==1 && self.maxOccurs==1)) | |
self.clear | |
for i in 0...tok.length | |
self.addChild(tok[i]) | |
end | |
self.kind=tok.kind | |
self.value=tok.value | |
if (self.minOccurs==1 && self.maxOccurs==1) | |
self.minOccurs=tok.minOccurs | |
self.maxOccurs=tok.maxOccurs | |
end | |
self.name=tok.name | |
changed=true | |
elsif (tok.kind==:CHOICE || tok.kind==:SEQUENCE) && | |
self.minOccurs==0 && self.maxOccurs==1 && tok.minOccurs==1 && tok.maxOccurs==-1 | |
self.clear | |
for i in 0...tok.length | |
self.addChild(tok[i]) | |
end | |
self.kind=tok.kind | |
self.value=tok.value | |
self.minOccurs=0 | |
self.maxOccurs=-1 | |
self.name=tok.name | |
changed=true | |
end | |
end | |
if @kind==:CHOICE | |
done=false | |
while !done | |
done=true | |
# isHexRangeRec = Choice of hex values and/or ranges of hex | |
# values, each of which can occur only once | |
isHexRangeRec = self.allHexChoiceRecursive() | |
if isHexRangeRec | |
hextokens=[] | |
self._hexChoiceTokens(hextokens) | |
@tokens=hextokens | |
end | |
i=0;while i<@tokens.length | |
tok=@tokens[i] | |
if !tok | |
i+=1; next | |
end | |
if (tok.kind==:HEX || tok.kind==:HEXRANGE) && !isHexRangeRec | |
# Simplify runs of hex/hexrange | |
runsize=1 | |
[email protected] | |
j=i+1;while j<@tokens.length | |
next if @tokens[j]==nil | |
if (@tokens[j].kind!=:HEX && @tokens[j].kind!=:HEXRANGE) | |
tokenend=j;break | |
end | |
runsize+=1 | |
j+=1;end | |
if runsize>1 && runsize<@tokens.length | |
tempchoice=TokenSymbol.new(:CHOICE) | |
for k in 0...runsize | |
tempchoice.addRaw(@tokens[i+k]) | |
end | |
tempchoice.simplify | |
newrunsize=runsize | |
if tempchoice.kind==:CHOICE && tempchoice.tokens.length<runsize | |
for k in 0...tempchoice.tokens.length | |
@tokens[i+k]=tempchoice[k] | |
end | |
newrunsize=tempchoice.tokens.length; done=false; changed=true | |
for k in (i+newrunsize)...tokenend | |
@tokens[k]=nil | |
end | |
elsif tempchoice.kind==:HEX || tempchoice.kind==:HEXRANGE | |
@tokens[i]=tempchoice | |
newrunsize=1; done=false; changed=true | |
for k in (i+newrunsize)...tokenend | |
@tokens[k]=nil | |
end | |
end | |
tok=@tokens[i] | |
end | |
i=tokenend | |
else | |
i+=1 | |
end | |
end | |
@tokens.compact! | |
i=0;while i<@tokens.length | |
tok=@tokens[i] | |
if !isHexRangeRec && !(tok.kind==:HEX || tok.kind==:HEXRANGE) | |
# don't combine past non-hex/hexrange | |
break | |
end | |
if (tok.kind==:HEX || tok.kind==:HEXRANGE) | |
found=false | |
tokMin=(tok.kind==:HEX) ? tok.value : tok.value[0] | |
tokMax=(tok.kind==:HEX) ? tok.value : tok.value[1] | |
j=i+1;while j<@tokens.length | |
tok2=@tokens[j] | |
j+=1 | |
next if !tok2 || tok==tok2 | |
if !isHexRangeRec && !(tok.kind==:HEX || tok.kind==:HEXRANGE) | |
# don't combine past non-hex/hexrange | |
break | |
end | |
if tok2.kind==:HEXRANGE && ( | |
(isHexRangeRec && tok2.minOccurs==tok.minOccurs && tok2.maxOccurs==tok.maxOccurs) || | |
(tok2.minOccurs==1 && tok.minOccurs==1 && tok2.maxOccurs==1 && tok.maxOccurs==1)) | |
if tok2.value[0]==tokMax+1 | |
tok2.value[0]=tokMin | |
tok2.name=nil | |
changed=true | |
found=true; break | |
elsif tok2.value[1]==tokMin-1 | |
tok2.value[1]=tokMax | |
tok2.name=nil | |
changed=true | |
found=true; break | |
end | |
elsif tok2.kind==:HEX && ( | |
(isHexRangeRec && tok2.minOccurs==tok.minOccurs && | |
tok2.maxOccurs==tok.maxOccurs) || | |
(tok2.minOccurs==1 && tok.minOccurs==1 && | |
tok2.maxOccurs==1 && tok.maxOccurs==1)) | |
if tok2.value==tokMax+1 | |
tok2.kind=:HEXRANGE | |
tok2.value=[tokMin,tok2.value] | |
tok2.name=nil | |
changed=true | |
found=true; break | |
elsif tok2.value==tokMin-1 | |
tok2.kind=:HEXRANGE | |
tok2.value=[tok2.value,tokMax] | |
changed=true | |
tok2.name=nil | |
found=true; break | |
end | |
end | |
end | |
if found | |
@tokens.delete_at(i) | |
i-=1 | |
done=false | |
end | |
end | |
i+=1 | |
end | |
end | |
end | |
if @kind==:CHOICE && ((self.minOccurs==1 && self.maxOccurs==1) || | |
self.allHexChoiceRecursive()) | |
done=false | |
while !done | |
done=true | |
i=0;while i<@tokens.length | |
tok=@tokens[i] | |
if tok.kind==:CHOICE && tok.length>1 && tok.minOccurs==1 && | |
tok.maxOccurs==1 | |
oldPosition=i | |
@tokens.delete_at(oldPosition) | |
i-=1 | |
changed=true | |
done=false | |
for j in 0...tok.length | |
self.insertChild(tok[tok.length-1-j],oldPosition) | |
end | |
end | |
i+=1 | |
end | |
end | |
end | |
if @kind==:STRING && @value.length==1 && !value[/[A-Za-z]/] | |
self.kind=:HEX | |
[email protected][0] | |
changed=true | |
end | |
if @kind==:SEQUENCE | |
for i in [email protected] | |
tok=@tokens[i] | |
nexttok=@tokens[i+1] | |
if nexttok && tok && tok.minOccurs==1 && tok.maxOccurs==1 && | |
nexttok.minOccurs==1 && nexttok.maxOccurs==1 | |
if tok.kind==:STRING && nexttok.kind==:STRING | |
tok.value+=nexttok.value | |
changed=true | |
@tokens[i+1]=nil | |
next | |
elsif tok.kind==:HEX && nexttok.kind==:HEX && | |
!self.isAsciiLetter(tok.value) && | |
!self.isAsciiLetter(nexttok.value) | |
tok.value=tok.value.chr | |
tok.value+=nexttok.value.chr | |
tok.kind=:STRING | |
changed=true | |
@tokens[i+1]=nil | |
next | |
end | |
end | |
end | |
end | |
@tokens.compact! | |
if (@kind==:CHOICE || @kind==:SEQUENCE) && @tokens.length==1 && | |
@tokens[0].minOccurs==1 && | |
@tokens[0].maxOccurs==1 | |
self.kind=@tokens[0].kind | |
self.value=@tokens[0].value | |
@name=@tokens[0].name if !@name || @name.length==0 | |
@tokens=@tokens[0].tokens | |
changed=true | |
end | |
for tok in @tokens | |
if tok.kind!=:HEX && tok.kind!=:HEXRANGE | |
oldkind=tok.kind | |
oldtok=tok.inspect | |
tok.simplify | |
if oldkind!=tok.kind | |
changed=true | |
else | |
changed|=(oldtok!=tok.inspect) | |
end | |
end | |
end | |
end | |
return self | |
end | |
def inspect | |
inspectDepth(0) | |
end | |
end | |
class ABNF | |
attr_reader :indexedTokens | |
attr_reader :maxLengthTokens | |
def initialize | |
@indexedTokens={} | |
@maxLengthTokens={} | |
end | |
# | |
# Parses a string containing ABNF productions. | |
# Returns a hash whose keys are the names and the | |
# values are ABNF grammar trees. There are six kinds of tokens: | |
# :SEQUENCE - matches all of the grammar nodes in order | |
# :CHOICE - matches one of the grammar nodes, to be | |
# checked in order | |
# :HEX - matches a single byte or character | |
# :HEXRANGE - matches a single byte or character within | |
# a range of values | |
# :STRING - matches an ASCII case-insensitive string | |
# :WORD - refers to another ABNF production | |
# | |
def parse(abnf) | |
abnflist={} | |
for a in splitAbnfLines(abnf) | |
parseAbnf(a,abnflist) | |
end | |
return abnflist | |
end | |
private | |
def isTokenStart(tok) | |
return false if !tok | |
case tok[0] | |
when :WORD,:STRING,:HEX,:HEXRANGE,:PARENOPEN | |
return true | |
else | |
return false | |
end | |
end | |
def parseAbnf(abnf,abnflist) | |
abnftest=[abnf] | |
tokens=[] | |
while true | |
token=nextToken(abnftest) | |
raise "Invalid ABNF: "+abnf if !token | |
break if token[0]==:END | |
tokens.push(token) | |
end | |
index=0 | |
while index<tokens.length | |
if tokens[index][0]==:TOKENVALUE | |
@indexedTokens[tokens[index][1][0]]=tokens[index][1][1] | |
index += 1 | |
elsif tokens[index][0]==:MAXLENGTH | |
@maxLengthTokens[tokens[index][1][0]]=tokens[index][1][1] | |
index += 1 | |
else | |
break | |
end | |
end | |
if tokens.length-index>2 | |
if tokens[index][0]==:WORD && tokens[index+1][0]==:EQUALS | |
name=tokens[index][1] | |
root=TokenSymbol.new(:SEQUENCE) | |
node=root | |
nextMinOccurs=nil | |
nextMaxOccurs=nil | |
for i in index+2...tokens.length | |
tok=tokens[i] | |
# Includes internal token :CSSTRING, for case-sensitive strings, | |
# converting them to :SEQUENCE tokens | |
case tok[0] | |
when :WORD, :STRING, :CSSTRING, :HEX, :HEXRANGE | |
if nextMinOccurs | |
if tok[0]==:STRING && tok[1].length==1 && tok[1][/[A-Za-z]/] | |
newtok=node.add([:CHOICE,nil]) | |
newtok.add([:HEX,tok[1].upcase.bytes[0]]) | |
newtok.add([:HEX,tok[1].downcase.bytes[0]]) | |
newtok.setMinMax(nextMinOccurs,nextMaxOccurs) | |
nextMinOccurs=nil | |
else | |
newtok=nil | |
if tok[0]==:CSSTRING | |
newtok=node.add([:SEQUENCE,nil]) | |
for b in tok[1].bytes | |
newtok.add([:HEX,b]) | |
end | |
p newtok | |
else | |
newtok=node.add(tok) | |
end | |
newtok.setMinMax(nextMinOccurs,nextMaxOccurs) | |
nextMinOccurs=nil | |
end | |
else | |
if tok[0]==:STRING && tok[1].length==1 && tok[1][/[A-Za-z]/] | |
newtok=node.add([:CHOICE,nil]) | |
newtok.add([:HEX,tok[1].upcase.bytes[0]]) | |
newtok.add([:HEX,tok[1].downcase.bytes[0]]) | |
else | |
newtok=nil | |
if tok[0]==:CSSTRING | |
newtok=node.add([:SEQUENCE,nil]) | |
for b in tok[1].bytes | |
newtok.add([:HEX,b]) | |
end | |
else | |
node.add(tok) | |
end | |
end | |
end | |
when :SLASH | |
if node.kind==:SEQUENCE | |
if node.parent && node.parent.kind==:CHOICE | |
node=node.parent.addChild(TokenSymbol.new(:SEQUENCE)) | |
else | |
node.kind=:CHOICE | |
node2=TokenSymbol.new(:SEQUENCE) | |
for i in 0...node.length | |
node2.addChild(node[i]) | |
end | |
node.clear | |
node.addChild(node2) | |
node=node.addChild(TokenSymbol.new(:SEQUENCE)) | |
end | |
else | |
node=node.addChild(TokenSymbol.new(:SEQUENCE)) | |
end | |
when :PARENOPEN | |
node=node.addChild(TokenSymbol.new(:SEQUENCE)) | |
if nextMinOccurs | |
node.setMinMax(nextMinOccurs,nextMaxOccurs) | |
nextMinOccurs=nil | |
end | |
when :PARENCLOSE, :BRACKETCLOSE | |
node=node.parent | |
if node && node.kind==:CHOICE | |
node=node.parent | |
end | |
when :BRACKETOPEN | |
node=node.addChild(TokenSymbol.new(:SEQUENCE)) | |
node.minOccurs=0 | |
when :ATLEASTNUMTIMES | |
raise "Invalid ABNF" if !isTokenStart(tokens[i+1]) | |
nextMinOccurs=tokens[i][1] | |
nextMaxOccurs=-1 | |
when :NUMTIMES | |
raise "Invalid ABNF" if !isTokenStart(tokens[i+1]) | |
nextMinOccurs=tokens[i][1] | |
nextMaxOccurs=tokens[i][1] | |
when :MINMAXTIMES | |
raise "Invalid ABNF" if !isTokenStart(tokens[i+1]) | |
nextMinOccurs=tokens[i][1][0] | |
nextMaxOccurs=tokens[i][1][1] | |
when :ANYNUMTIMES | |
raise "Invalid ABNF" if !isTokenStart(tokens[i+1]) | |
nextMinOccurs=-1 | |
nextMaxOccurs=-1 | |
when :TOKENVALUE | |
@indexedTokens[tokens[i][1][0]]=tokens[i][1][1] | |
when :MAXLENGTH | |
@maxLengthTokens[tokens[i][1][0]]=tokens[i][1][1] | |
else | |
raise "Invalid ABNF: "+abnf | |
end | |
end | |
root.simplify | |
root.name=name | |
if abnflist[name] | |
p "Warning: "+name+" defined already" | |
end | |
abnflist[name]=root | |
end | |
end | |
end | |
def chop(str,y) | |
return str[y.length,str.length] | |
end | |
def nextToken(xa) | |
x=xa[0] | |
x=x.gsub(/\A\s+/,"") | |
if x.length==0 | |
return [:END,nil] | |
end | |
if x[/\A;(.*)/] | |
# NOTE: This is an extension of ABNF, in | |
# order to allow numbers to be assigned | |
# to ABNF productions | |
xa[0]=chop(x,$&) | |
cmt=$1 | |
if cmt[/\A\s*TOKEN\:\s*(\S+)\s+(\d+)/] | |
return [:TOKENVALUE,[$1,$2.to_i]] | |
end | |
# NOTE: This is an extension of ABNF, in | |
# order to assign a maximum string length for | |
# ABNF productions | |
if cmt[/\A\s*MAXLENGTH\:\s*(\S+)\s+(\d+)/] | |
return [:MAXLENGTH,[$1,$2.to_i]] | |
end | |
# comment | |
return nextToken(xa) | |
end | |
if x[/\A\(\s*/] | |
xa[0]=chop(x,$&) | |
return [:PARENOPEN,nil] | |
elsif x[/\A\)\s*/] | |
xa[0]=chop(x,$&) | |
return [:PARENCLOSE,nil] | |
elsif x[/\A\[\s*/] | |
xa[0]=chop(x,$&) | |
return [:BRACKETOPEN,nil] | |
elsif x[/\A\]\s*/] | |
xa[0]=chop(x,$&) | |
return [:BRACKETCLOSE,nil] | |
elsif x[/\A\/\s*/] | |
xa[0]=chop(x,$&) | |
return [:SLASH,nil] | |
elsif x[/\A([0-9]+)\*([0-9]+)/] | |
xa[0]=chop(x,$&) | |
return [:MINMAXTIMES,[$1.to_i,$2.to_i]] | |
elsif x[/\A\*([0-9]+)/] | |
xa[0]=chop(x,$&) | |
return [:MINMAXTIMES,[0,$1.to_i]] | |
elsif x[/\A([0-9]+)\*/] | |
xa[0]=chop(x,$&) | |
return [:ATLEASTNUMTIMES,$&.to_i] | |
elsif x[/\A\*/] | |
xa[0]=chop(x,$&) | |
return [:ANYNUMTIMES,nil] | |
elsif x[/\A<\">/] | |
xa[0]=chop(x,$&) | |
return [:STRING,"\""] | |
elsif x[/\A(?:\%i)?\"([^\"]+)\"/] | |
# Case-insensitive string | |
xa[0]=chop(x,$&) | |
return [:STRING,$1] | |
elsif x[/\A\%s\"([^\"]+)\"/] | |
# Case-sensitive string | |
xa[0]=chop(x,$&) | |
return [:CSSTRING,$1] | |
elsif x[/\A\%x([0-9A-Fa-f]+)-([0-9A-Fa-f]+)/] | |
xa[0]=chop(x,$&) | |
return [:HEXRANGE,[$1.to_i(16),$2.to_i(16)]] | |
elsif x[/\A\%x([0-9A-Fa-f]+)/] | |
xa[0]=chop(x,$&) | |
hexval=$1.to_i(16) | |
x=xa[0] | |
if x[/\A\.([0-9A-Fa-f]+)/] | |
str=hexval.chr | |
while x[/\A\.([0-9A-Fa-f]+)/] | |
xa[0]=chop(x,$&) | |
str+=($1.to_i(16)).chr | |
x=xa[0] | |
end | |
return [:CSSTRING,str] | |
else | |
return [:HEX,hexval] | |
end | |
elsif x[/\A\%d([0-9]+)-([0-9]+)/] | |
xa[0]=chop(x,$&) | |
return [:HEXRANGE,[$1.to_i(10),$2.to_i(10)]] | |
elsif x[/\A\%d([0-9]+)/] | |
xa[0]=chop(x,$&) | |
hexval=$1.to_i(10) | |
x=xa[0] | |
if x[/\A\.([0-9]+)/] | |
str=hexval.chr | |
while x[/\A\.([0-9]+)/] | |
xa[0]=chop(x,$&) | |
str+=($1.to_i(10)).chr | |
x=xa[0] | |
end | |
return [:CSSTRING,str] | |
else | |
return [:HEX,hexval] | |
end | |
elsif x[/\A\:?\=/] | |
xa[0]=chop(x,$&) | |
return [:EQUALS,nil] | |
elsif x[/\A([0-9]+)/] | |
xa[0]=chop(x,$&) | |
return [:NUMTIMES,$&.to_i] | |
elsif x[/\A([A-Za-z0-9_-]+)/] | |
xa[0]=chop(x,$&) | |
return [:WORD,$&] | |
end | |
return nil | |
end | |
def splitAbnfLines(x) | |
lines=x.split(/\r?\n/) | |
ret=[] | |
curline=nil | |
filebegin="" | |
for line in lines | |
if line[/^\s*([A-Za-z0-9_-]+)\s*\:?\=/] | |
ret.push(curline) if curline && curline.length>0 | |
curline=filebegin+line+"\r\n" | |
filebegin="" | |
elsif curline && curline.length>0 | |
curline+=line+"\r\n" | |
else | |
filebegin+=line+"\r\n" | |
end | |
end | |
ret.push(curline||filebegin) | |
return ret | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment