Last active
August 3, 2023 07:00
-
-
Save GibsonRuitiari/62df950d2d4ba512d3dd79037923af15 to your computer and use it in GitHub Desktop.
LLR Parser for regex exp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* a fully fledged parser than parses regex expressions into AST on the fly | |
* note the parser is an LL(1), and the grammar used is context-free | |
* right recursive. | |
* Also, the parser does not tokenize the inputs thus making it a little | |
* faster than those that tokenize their inputs. | |
* Currently, it parses all regexes apart from those having | |
* back-references & Zero-Width Positive Lookahead Assertions '?=' | |
* For reference see: https://en.wikipedia.org/wiki/Compilers:_Principles,_Techniques,_and_Tools | |
* */ | |
fun main() { | |
// pervasive regexes gotten from https://digitalfortress.tech/tips/top-15-commonly-used-regex/ | |
val simpleRegex ="^-?\\d*(\\.\\d+)?\$" | |
val emailRegex="^([a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,6})*$" | |
val telephoneRegex="^[-]?[0-9]+[,.]?[0-9]*([\\/][0-9]+[,.]?[0-9]*)*\$" | |
val passwordRegex="([!@#\$%^&*()[\\]{}\\-_+=~`|:;\"'<>,.?])" | |
val colorHexRegex = "^#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})\$" | |
val wholeDecimalFractions ="[-]?[0-9]+[,.]?[0-9]*([\\/][0-9]+[,.]?[0-9]*)*" | |
val complexPasswordRegex ="(?:(.*[0-9]))(.*[\\!@#\$%^&*()[]{}\\-_+=~`|:;\"'<>,./?])(.*[a-z])(?:(.*[A-Z]))((.*)).{8,}" | |
val urlHttpProtocol = "https?:\\/\\/(www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%_\\+.~#()?&//=]*)" | |
// ([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$ | |
val ipv4Address="(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\\.){3}\$" | |
val timeFormat ="(?:[01]\\d|2[0123]):(?:[012345]\\d):(?:[012345]\\d)" | |
val matched=Grammar.getAst().parse(timeFormat) | |
println(matched?.matched) | |
} | |
interface Parser<T>{ | |
// returns matched item and the remaining literals | |
fun parse(input:String):ParseResult<T>? | |
} | |
data class ParseResult<T>(val remainingLiterals:String,val matched:T) | |
object Parsers{ | |
fun consumePrefix(prefix:Char):Parser<Unit>{ | |
return object :Parser<Unit>{ | |
override fun parse(input: String): ParseResult<Unit>? { | |
return if (input.isEmpty()) null | |
else if (input.first()==prefix) ParseResult(remainingLiterals = input.drop(1), matched = Unit) | |
else null | |
} | |
} | |
} | |
fun consumePrefix(prefix:String):Parser<Unit>{ | |
return object :Parser<Unit>{ | |
override fun parse(input: String): ParseResult<Unit>? { | |
return if (input.isEmpty()) null | |
else if (input.first() in prefix) ParseResult(remainingLiterals = input.drop(1), matched = Unit) | |
else null | |
} | |
} | |
} | |
fun consumeCharacterExcluding(exclude:String):Parser<LetterLiteral>{ | |
return object :Parser<LetterLiteral>{ | |
override fun parse(input: String): ParseResult<LetterLiteral>? { | |
return if (input.isEmpty()) return null | |
else{ | |
if (input in exclude) return null | |
val consumed=consumeCharacter().parse(input) ?: return null | |
if (!exclude.contains(consumed.matched.letter)){ | |
ParseResult(remainingLiterals = consumed.remainingLiterals, | |
matched = consumed.matched) | |
}else null | |
} | |
} | |
} | |
} | |
fun consumeCharacter():Parser<LetterLiteral>{ | |
return object :Parser<LetterLiteral>{ | |
override fun parse(input: String): ParseResult<LetterLiteral>? { | |
return if (input.isEmpty()) null | |
else { | |
val characterToBeConsumed = input.first() | |
ParseResult(remainingLiterals = input.drop(1), matched = LetterLiteral(characterToBeConsumed)) | |
} | |
} | |
} | |
} | |
// natural numbers or zero | |
fun consumeNumber():Parser<IntegerLiteral>{ | |
return object :Parser<IntegerLiteral>{ | |
override fun parse(input: String): ParseResult<IntegerLiteral>? { | |
var remainingLiterals=input | |
val digits = StringBuilder() | |
while (true){ | |
if (remainingLiterals.isEmpty() || | |
remainingLiterals.isBlank() || | |
remainingLiterals.firstOrNull()?.isDigit()==false) break | |
val result=consumeDigit().parse(remainingLiterals) ?: break | |
val digit = result.matched.integer | |
digits.append(digit) | |
remainingLiterals=result.remainingLiterals | |
} | |
return try { | |
val number =digits.toString().toInt() | |
ParseResult(remainingLiterals = remainingLiterals, | |
matched = IntegerLiteral(number)) | |
}catch (ex:NumberFormatException){ | |
println("failed to convert the string to number") | |
null | |
} | |
} | |
} | |
} | |
// consume just one digit | |
fun consumeDigit():Parser<IntegerLiteral>{ | |
return object :Parser<IntegerLiteral>{ | |
override fun parse(input: String): ParseResult<IntegerLiteral>? { | |
return if (input.isEmpty()) null | |
else { | |
val charToBeConsumed=input.first() | |
if (charToBeConsumed.isDigit()) ParseResult(matched=IntegerLiteral(charToBeConsumed.digitToInt()), | |
remainingLiterals = input.drop(1)) else null | |
} | |
} | |
} | |
} | |
} | |
// misc utils that makes it easier to pass the tokens into our | |
// corresponding grammar | |
fun<A> oneOf(vararg parsers:Parser<A>):Parser<A>{ | |
return object :Parser<A>{ | |
override fun parse(input: String): ParseResult<A>? { | |
val matched= parsers.firstOrNull { it.parse(input)!=null }?.parse(input) ?: return null | |
return ParseResult(matched = matched.matched, | |
remainingLiterals = matched.remainingLiterals | |
) | |
} | |
} | |
} | |
inline fun<A,B> Parser<A>.map(crossinline transform:(A)->B): Parser<B> { | |
return object :Parser<B>{ | |
override fun parse(input: String): ParseResult<B>? { | |
val (remaining,matched)[email protected](input) ?: return null | |
return ParseResult(remainingLiterals = remaining,transform(matched)) | |
} | |
} | |
} | |
// defines our grammar according to our Grammar.ebnf | |
object Grammar{ | |
fun getAst():Parser<AST>{ | |
return object :Parser<AST>{ | |
override fun parse(input: String): ParseResult<AST>? { | |
val isFromStartingResult=Parsers.consumePrefix('^').parse(input) | |
val isFromStarting = isFromStartingResult!=null | |
val remainingLiterals=isFromStartingResult?.remainingLiterals ?: input | |
val expressionResult= getExpression().parse(remainingLiterals) ?:return null | |
// we have reached the end? if yes return the result if no just return null since | |
// the expression is unrecognized by our parser | |
require(expressionResult.remainingLiterals.isEmpty() || expressionResult.remainingLiterals==")"){"expression must be empty or must not end with an unescaped )"} | |
return ParseResult(matched = AST(root = expressionResult.matched, isFromStartOfString = isFromStarting), | |
remainingLiterals = expressionResult.remainingLiterals) | |
} | |
} | |
} | |
private val lazyExpression by lazy { getExpression() } | |
fun getExpression():Parser<Expression>{ | |
return object :Parser<Expression>{ | |
override fun parse(input: String): ParseResult<Expression>? { | |
val leftSubExpression=getSubexpression().parse(input) ?: return null | |
val alternation = Parsers.consumePrefix('|').parse(leftSubExpression.remainingLiterals) | |
return if (alternation==null){ | |
ParseResult(matched =Expression( | |
subExpression = leftSubExpression.matched, | |
rightAlternativeExpression = null), | |
remainingLiterals = leftSubExpression.remainingLiterals) | |
}else{ | |
val rightExpression = getExpression().parse(alternation.remainingLiterals) ?: throw IllegalArgumentException("a valid expression must follow after the |") | |
println("right expression sub ---> ${rightExpression.matched.subExpression} | alternative-- ${rightExpression.matched.rightAlternativeExpression}") | |
println("----") | |
ParseResult(matched =Expression( | |
subExpression = leftSubExpression.matched, | |
rightAlternativeExpression = rightExpression.matched), | |
remainingLiterals = rightExpression.remainingLiterals) | |
} | |
} | |
} | |
} | |
fun getSubexpression():Parser<SubExpression>{ | |
// one or more [+] | |
return object :Parser<SubExpression>{ | |
override fun parse(input: String): ParseResult<SubExpression> { | |
val matches = mutableListOf<SubExpressionItem>() | |
var remaining=input | |
while (true){ | |
// if it starts with an unescaped | then it means it is another sub-expression so break and return | |
// the remaining literals will be evaluated on their own | |
if (remaining.isEmpty() || remaining.isBlank()) break | |
val result=getSubexpressionItem().parse(remaining) ?: break | |
remaining=result.remainingLiterals | |
matches.add(result.matched) | |
} | |
check(matches.isNotEmpty()) {"pattern must not be empty!"} | |
return ParseResult(matched = SubExpression(matches), | |
remainingLiterals =remaining) | |
} | |
} | |
} | |
fun getSubexpressionItem():Parser<SubExpressionItem>{ | |
return object : Parser<SubExpressionItem>{ | |
// one of group,anchor, match, character | |
override fun parse(input: String): ParseResult<SubExpressionItem>? { | |
val subExpressionItemResult= oneOf( | |
getMatch().map { SubexpressionMatch(it)}, | |
getGroup().map { SubexpressionGroup(it) }, | |
getAnchor().map { | |
SubexpressionAnchor(it)},).parse(input) ?: return null | |
return ParseResult(matched = subExpressionItemResult.matched, | |
remainingLiterals = subExpressionItemResult.remainingLiterals) | |
} | |
} | |
} | |
fun getGroup():Parser<QuantifiedGroup>{ | |
return object :Parser<QuantifiedGroup>{ | |
override fun parse(input: String): ParseResult<QuantifiedGroup>? { | |
var _input = input | |
val openingParenthesis = Parsers.consumePrefix("(").parse(_input)?:return null | |
val isNonCapturing=openingParenthesis.remainingLiterals.take(2) == "?:" | |
_input = if (isNonCapturing) openingParenthesis.remainingLiterals.drop(2) | |
else openingParenthesis.remainingLiterals | |
val group= lazyExpression.parse(_input)?:return null | |
_input=group.remainingLiterals | |
val optionalQuantifier=getQuantifier().parse(_input) | |
_input = optionalQuantifier?.remainingLiterals ?: _input | |
val closingParenthesis = Parsers.consumePrefix("\\)").parse(_input) ?:return null | |
_input=closingParenthesis.remainingLiterals | |
// the group is followed by a quantifier (maybe) | |
val optionalGroupQuantifier=getQuantifier().parse(_input) | |
group.matched | |
val output=Group(isGroupCapturing = !isNonCapturing, expression = group.matched, | |
quantifierType = optionalQuantifier?.matched) | |
return ParseResult(matched=QuantifiedGroup(group = output, | |
quantifier = optionalGroupQuantifier?.matched), | |
remainingLiterals = optionalGroupQuantifier?.remainingLiterals ?: _input) | |
} | |
} | |
} | |
fun getMatch(): Parser<Match> { | |
return object :Parser<Match>{ | |
override fun parse(input: String): ParseResult<Match>? { | |
if (input.isEmpty()) return null | |
val result=oneOf(Parsers.consumePrefix('.').map { MatchAnyCharacter()}, | |
getCharacterGroup().map { MatchCharacterGroup(it)}, | |
getCharacterClass().map { MatchCharacterClass(it)}, | |
// exclude these characters because they will be handled by the respective handlers | |
// including them will make the parser go in a never-ending loop :) | |
Parsers.consumeCharacterExcluding("\\()|*?+$").map { | |
MatchCharacter(it) | |
}, | |
getEscapedCharacter().map { MatchCharacter(it)}).parse(input) ?: return null | |
val optionalQuantifier=getQuantifier().parse(result.remainingLiterals) | |
val remainingLiterals = optionalQuantifier?.remainingLiterals ?: result.remainingLiterals | |
return ParseResult(remainingLiterals =remainingLiterals, | |
matched = Match(matchItem = result.matched,quantifier = optionalQuantifier?.matched)) | |
} | |
} | |
} | |
fun getCharacterGroup():Parser<CharacterGroup>{ | |
return object :Parser<CharacterGroup> { | |
override fun parse(input: String): ParseResult<CharacterGroup>? { | |
val openingParenthesisResult = Parsers.consumePrefix('[').parse(input) ?: return null | |
val optionalInvertedMarkerResult = Parsers.consumePrefix('^') | |
.parse(openingParenthesisResult.remainingLiterals) | |
val isGroupInverted = optionalInvertedMarkerResult!=null | |
var remainingLiterals = optionalInvertedMarkerResult?.remainingLiterals ?: openingParenthesisResult.remainingLiterals | |
val matches = mutableListOf<CharacterGroupItem>() | |
while (true){ | |
// if it starts with then it means we have a quantifier next so break out of the loop and consume the ] then return the quantifier | |
if (remainingLiterals.isEmpty() || remainingLiterals.isBlank() || remainingLiterals.startsWith(']')) break | |
val characterGroupItemResult = getCharacterGroupItem().parse(remainingLiterals) ?: break | |
remainingLiterals = characterGroupItemResult.remainingLiterals | |
matches.add(characterGroupItemResult.matched) | |
} | |
check(matches.isNotEmpty()){"character group cannot be empty!"} | |
val closingParenthesisResult=Parsers.consumePrefix(']').parse(remainingLiterals) ?: throw IllegalArgumentException("character group must end with a ]") | |
return ParseResult(remainingLiterals = closingParenthesisResult.remainingLiterals, | |
matched = CharacterGroup(isInverted = isGroupInverted, | |
groupItems = matches)) | |
} | |
} | |
} | |
// one of either character range character class or character | |
fun getCharacterGroupItem():Parser<CharacterGroupItem>{ | |
return object :Parser<CharacterGroupItem>{ | |
override fun parse(input: String): ParseResult<CharacterGroupItem>? { | |
// val unescapedBackslash=Parsers.consumePrefix("/").parse(input) | |
// if (unescapedBackslash!=null) throw IllegalArgumentException("an unescaped delimiter must be escaped with a backslash") | |
val result=oneOf(getCharacterClass().map { it }, | |
getCharacterRange().map { it }, | |
getDigitRange().map{it}, | |
getEscapedCharacter().map { Character(it.letter) }, | |
Parsers.consumeCharacterExcluding("]") | |
.map { Character(it.letter) }).parse(input) ?: return null | |
return ParseResult(remainingLiterals = result.remainingLiterals, | |
matched = result.matched) | |
} | |
} | |
} | |
fun getEscapedCharacter():Parser<LetterLiteral>{ | |
return object :Parser<LetterLiteral>{ | |
override fun parse(input: String): ParseResult<LetterLiteral>? { | |
val escapedBackslash=Parsers.consumePrefix('\\').parse(input) ?: return null | |
if (escapedBackslash.remainingLiterals.isEmpty()) throw IllegalArgumentException("dude you cannot end pattern with a '\\'") | |
val consumedCharacterResult=Parsers.consumeCharacter().parse(escapedBackslash.remainingLiterals) ?: return null | |
consumedCharacterResult.matched | |
return ParseResult(matched = consumedCharacterResult.matched, | |
remainingLiterals = consumedCharacterResult.remainingLiterals) | |
} | |
} | |
} | |
//[0-9] | |
fun getDigitRange():Parser<DigitRange>{ | |
return object :Parser<DigitRange>{ | |
override fun parse(input: String): ParseResult<DigitRange>? { | |
val openingBracketResult=Parsers.consumePrefix('[').parse(input) | |
val lowerBoundResult=if (openingBracketResult==null){ | |
Parsers.consumeCharacter().parse(input) ?: return null | |
}else{ | |
Parsers.consumeCharacter().parse(openingBracketResult.remainingLiterals) ?: return null | |
} | |
val dashResult= Parsers.consumePrefix('-').parse(lowerBoundResult.remainingLiterals) ?: return null | |
val upperBoundResult = Parsers.consumeCharacter().parse(dashResult.remainingLiterals) ?: return null | |
val remainingLiterals=if (openingBracketResult!=null){ | |
val result= Parsers.consumePrefix(']').parse(upperBoundResult.remainingLiterals ) ?: return null | |
result.remainingLiterals | |
}else{ | |
upperBoundResult.remainingLiterals | |
} | |
if (upperBoundResult.matched.letter !in digits() || lowerBoundResult.matched.letter !in digits()) return null | |
if (upperBoundResult.matched.letter <= lowerBoundResult.matched.letter) throw IllegalArgumentException("upper bound must be greater than lower bound") | |
return ParseResult(remainingLiterals = remainingLiterals, | |
matched = DigitRange(IntegerLiteral(lowerBoundResult.matched.letter.digitToInt()).. | |
IntegerLiteral(upperBoundResult.matched.letter.digitToInt()))) | |
} | |
} | |
} | |
// [a-z] | |
fun getCharacterRange():Parser<CharacterRange>{ | |
return object :Parser<CharacterRange>{ | |
override fun parse(input: String): ParseResult<CharacterRange>? { | |
val openingBracketResult=Parsers.consumePrefix('[').parse(input) | |
val lowerBoundResult=if (openingBracketResult==null){ | |
Parsers.consumeCharacter().parse(input) ?: return null | |
}else{ | |
Parsers.consumeCharacter().parse(openingBracketResult.remainingLiterals) ?: return null | |
} | |
val dashResult= Parsers.consumePrefix('-').parse(lowerBoundResult.remainingLiterals) ?: return null | |
val upperBoundResult = Parsers.consumeCharacter().parse(dashResult.remainingLiterals) ?: return null | |
val remainingLiterals=if (openingBracketResult!=null){ | |
val result= Parsers.consumePrefix(']').parse(upperBoundResult.remainingLiterals ) ?: return null | |
result.remainingLiterals | |
}else{ | |
upperBoundResult.remainingLiterals | |
} | |
if (upperBoundResult.matched.letter !in letters() || lowerBoundResult.matched.letter !in letters()) return null | |
if (upperBoundResult.matched.letter <= lowerBoundResult.matched.letter) throw IllegalArgumentException("upper bound must be greater than lower bound") | |
return ParseResult(remainingLiterals=remainingLiterals, | |
matched = CharacterRange(LetterLiteral(lowerBoundResult.matched.letter)..LetterLiteral(upperBoundResult.matched.letter))) | |
} | |
} | |
} | |
fun getCharacterClass():Parser<CharacterClass>{ | |
return object :Parser<CharacterClass>{ | |
override fun parse(input: String): ParseResult<CharacterClass>? { | |
val prefixResult= Parsers.consumePrefix('\\').parse(input) ?: return null | |
val consumedCharResult=Parsers.consumeCharacter().parse(prefixResult.remainingLiterals) | |
return if (consumedCharResult?.matched!=null){ | |
val characterClass = makeCharacterClass(consumedCharResult.matched.letter) | |
if (characterClass!=null) | |
ParseResult(remainingLiterals = consumedCharResult.remainingLiterals, | |
matched = characterClass) | |
else null | |
}else null | |
} | |
} | |
} | |
private fun makeCharacterClass(char: Char):CharacterClass?{ | |
return when(char){ | |
'd'-> CharacterClassAnyDecimalDigit() | |
'D'->CharacterClassAnyNonDecimalDigit() | |
's'->CharacterClassAnyWhiteSpace() | |
'S'->CharacterClassAnyNonWhiteSpace() | |
'w'->CharacterClassAnyWord() | |
'W'->CharacterClassAnyNonWord() | |
else->null | |
} | |
} | |
// * + ? {2,4} | |
fun getQuantifier():Parser<Quantifier>{ | |
return object :Parser<Quantifier>{ | |
override fun parse(input: String): ParseResult<Quantifier>? { | |
if (input.isEmpty()) return null | |
val quantifiers=oneOf(Parsers.consumePrefix('*').map {ZeroOrMoreQuantifier}, | |
Parsers.consumePrefix('+').map {OneOrMoreQuantifier }, | |
Parsers.consumePrefix('?').map { ZeroOrOneQuantifier }, | |
getRangeQuantifier().map { it }).parse(input) ?: return null | |
val isQuantifierLazy =Parsers.consumePrefix('?').parse(quantifiers.remainingLiterals) | |
val remainingLiterals = isQuantifierLazy?.remainingLiterals ?: quantifiers.remainingLiterals | |
return ParseResult(remainingLiterals = remainingLiterals, | |
matched = Quantifier(isQuantifierLazy =isQuantifierLazy!=null, | |
quantifierType = quantifiers.matched)) | |
} | |
} | |
} | |
// {2,4} {2} {4,} | |
fun getRangeQuantifier():Parser<RangeQuantifier>{ | |
return object :Parser<RangeQuantifier>{ | |
override fun parse(input: String): ParseResult<RangeQuantifier>? { | |
val bracketPrefix= Parsers.consumePrefix('{').parse(input) ?: return null | |
var remaining= bracketPrefix.remainingLiterals | |
val lowerBoundResult= Parsers.consumeNumber().parse(remaining) ?: return null | |
remaining=lowerBoundResult.remainingLiterals | |
val lowerBound= lowerBoundResult.matched.integer | |
// ,3 --> optional | |
val parsedComma=Parsers.consumePrefix(',').parse(remaining) | |
return if (parsedComma==null){ | |
// we do not have an upper bound we thus create the range quantifier here | |
// check that we have {1 -we are here check if we have a } | |
val closingBracketResult=Parsers.consumePrefix('}').parse(remaining) ?: throw IllegalArgumentException("missing a }!") | |
remaining=closingBracketResult.remainingLiterals | |
ParseResult(remainingLiterals = remaining, matched = makeRangeQuantifier(lowerBound,null)) | |
}else{ | |
// we do have upper bound | |
remaining=parsedComma.remainingLiterals | |
val upperBoundResult=Parsers.consumeNumber().parse(remaining) | |
upperBoundResult?.let { | |
remaining=it.remainingLiterals | |
} | |
// means we just had a comma if the upper bound is null | |
// if it's null -- {2,} | |
val upperBound = upperBoundResult?.matched?.integer ?:lowerBound | |
check(upperBound>=lowerBound){"upper bound needs to be greater or equal to the lower bound not the other way round"} | |
val closingBracketResult = Parsers.consumePrefix('}').parse(remaining) ?: throw IllegalArgumentException("missing a }!") | |
ParseResult(remainingLiterals = closingBracketResult.remainingLiterals, | |
matched = makeRangeQuantifier(lowerBound, upperBound)) | |
} | |
} | |
} | |
} | |
private fun makeRangeQuantifier(lowerBound: Int,upperBound: Int?):RangeQuantifier{ | |
return RangeQuantifier(lowerBound = lowerBound, upperBound = upperBound?:lowerBound) | |
} | |
// one of escaped anchor/ end of string anchor | |
fun getAnchor():Parser<Anchor>{ | |
return object :Parser<Anchor>{ | |
override fun parse(input: String): ParseResult<Anchor>? { | |
val remainingLiterals = StringBuilder() | |
val anchor=if (input.startsWith("\\")){ | |
val result=getEscapedAnchor(input) | |
if (result!=null){ | |
val (remaining,matched) = result | |
val escapedAnchor=when(matched.letter){ | |
'B'->AnchorNonWordBoundary() | |
'b'->AnchorWordBoundary() | |
'z'->AnchorEndOfStringNotNewline() | |
'A'->AnchorStartOfStringOnly() | |
'Z'->AnchorEndOfStringOnly() | |
else->null | |
} | |
remainingLiterals.append(remaining) | |
escapedAnchor | |
}else null | |
}else if (input.startsWith("$")){ | |
val endOfStringAnchor=Parsers.consumePrefix('$').parse(input) | |
// what about (?:[a-b]$) ?is this a valid expression? For now we treat it as such | |
// because it does make sense logically | |
// if(endOfStringAnchor?.remainingLiterals?.isNotEmpty() == true) throw IllegalArgumentException("$ cannot be used in the middle of the expression") | |
if(endOfStringAnchor?.matched!=null){ | |
AnchorEndOfString() | |
}else null | |
}else null | |
return if (anchor==null) null else ParseResult(remainingLiterals = remainingLiterals.toString(), matched = anchor) | |
} | |
} | |
} | |
private fun getEscapedAnchor(input: String): ParseResult<LetterLiteral>? { | |
val parserResult = Parsers.consumePrefix('\\').parse(input) ?: return null | |
return Parsers.consumeCharacter().parse(parserResult.remainingLiterals) | |
} | |
} | |
// our tree containing root/top node | |
data class AST(val isFromStartOfString:Boolean,val root:Expression) | |
// our ast types which are basically our return types after parsing | |
// this is the base grammar type whereby it represents a single unit of grammar | |
sealed interface GrammarType | |
// this defines a composite of grammar types whereby it represents a collection of grammar | |
// the major expression which is made up of either a single expression | |
// or a single + another whereby the another = right alternative expression i.e | |
// exp|exp or exp | |
data class Expression( | |
val subExpression: SubExpression, val rightAlternativeExpression: Expression?=null):GrammarType | |
data class SubExpression(val subExpressionItems:List<SubExpressionItem>):GrammarType | |
sealed interface SubExpressionItem | |
data class SubexpressionMatch(val match: Match):SubExpressionItem | |
data class SubexpressionGroup(val group: QuantifiedGroup):SubExpressionItem | |
data class SubexpressionAnchor(val anchor:Anchor):SubExpressionItem | |
data class QuantifiedGroup(val group:Group,val quantifier: Quantifier?=null):GrammarType | |
data class Group(val isGroupCapturing:Boolean, | |
val expression: Expression, | |
val quantifierType: Quantifier?=null):GrammarType | |
data class Match(val matchItem: MatchItem, val quantifier:Quantifier?=null):GrammarType | |
sealed interface MatchItem | |
data class MatchAnyCharacter(val anyCharacterType:String="."):MatchItem | |
data class MatchCharacterClass(val characterClass: CharacterClass):MatchItem | |
data class MatchCharacter(val character:LetterLiteral):MatchItem | |
data class MatchCharacterGroup(val characterGroup:CharacterGroup):MatchItem | |
data class CharacterGroup(val groupItems:List<CharacterGroupItem>,val isInverted:Boolean):GrammarType | |
sealed interface CharacterGroupItem | |
sealed interface CharacterClass:CharacterGroupItem | |
data class CharacterClassAnyWord(val characterClassTypeValue: String=word()):CharacterClass | |
data class CharacterClassAnyNonWord(val characterClassTypeValue: String=nonWord()):CharacterClass | |
data class CharacterClassAnyDecimalDigit(val characterClassTypeValue: String=digits()):CharacterClass | |
data class CharacterClassAnyNonDecimalDigit(val characterClassTypeValue: String=letters()+punctuation()) | |
:CharacterClass | |
data class CharacterClassAnyWhiteSpace(val characterClassTypeValue: String=whitespace()):CharacterClass | |
data class CharacterClassAnyNonWhiteSpace(val characterClassTypeValue: String=nonWhitespace()):CharacterClass | |
data class CharacterRange(val range:ClosedRange<LetterLiteral>):CharacterGroupItem | |
data class DigitRange(val range:ClosedRange<IntegerLiteral>):CharacterGroupItem | |
data class Character(val char:Char):CharacterGroupItem | |
// quantifier | |
data class Quantifier(val quantifierType: QuantifierType,val isQuantifierLazy:Boolean):GrammarType | |
// quantifier types | |
sealed interface QuantifierType:GrammarType | |
object ZeroOrMoreQuantifier:QuantifierType | |
object OneOrMoreQuantifier:QuantifierType | |
object ZeroOrOneQuantifier:QuantifierType | |
// range quantifier | |
data class RangeQuantifier(val upperBound:Int,val lowerBound:Int):QuantifierType | |
// Anchors - the anchor type value is unescaped | |
sealed class Anchor(open val anchorTypeValue:String):GrammarType | |
data class AnchorWordBoundary(override val anchorTypeValue: String = "b"):Anchor(anchorTypeValue) | |
data class AnchorNonWordBoundary(override val anchorTypeValue: String = "B"):Anchor(anchorTypeValue) | |
data class AnchorEndOfString(override val anchorTypeValue: String = "$"):Anchor(anchorTypeValue) | |
data class AnchorEndOfStringNotNewline(override val anchorTypeValue: String = "z"):Anchor(anchorTypeValue) | |
data class AnchorEndOfStringOnly(override val anchorTypeValue: String = "Z"):Anchor(anchorTypeValue) | |
data class AnchorStartOfStringOnly(override val anchorTypeValue: String = "A"):Anchor(anchorTypeValue) | |
// basic grammar type in this case literals | |
// [0-9]+ | |
data class IntegerLiteral(val integer:Int):GrammarType,Comparable<IntegerLiteral>{ | |
override fun compareTo(other: IntegerLiteral): Int { | |
return if (integer == other.integer)0 | |
else if (integer < other.integer)-1 | |
else 1 | |
} | |
} | |
// letters [a-zA-z]+ or just a single one | |
data class LetterLiteral(val letter:Char):GrammarType,Comparable<LetterLiteral>{ | |
override fun compareTo(other: LetterLiteral): Int { | |
return if (letter ==other.letter) 0 | |
else if (letter < other.letter) -1 | |
else 1 | |
} | |
} | |
// CharacterSets not defined by standard library (probably they should?) | |
fun punctuation() ="!\\\"#\$%&'()*+,-./:;<=>?@[\\\\]^_`{|}~" | |
fun whitespace() =" \\t\\n\\r\\f" | |
fun word() =letters()+digits()+"-" | |
fun nonWord()= (32..126).joinToString(""){ it.toChar().toString() }.filterNot { it in word() } | |
fun nonWhitespace() =(32..126).joinToString("") { it.toChar().toString() }.filterNot { it.isWhitespace() } | |
fun digits() = (0..9).joinToString("") | |
fun letters() = lowerCase()+upperCase() | |
fun lowerCase() = (97..122).joinToString("") { it.toChar().toString() } | |
fun upperCase() = (65..90).joinToString("") { it.toChar().toString()} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment