Last active
May 23, 2022 03:22
-
-
Save Phryxia/c6919658612265b020d8973915e70f1a to your computer and use it in GitHub Desktop.
Regular Language Description Language Recognizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const CHARSET = '01' | |
| const UNRECOGNIZED = -1 | |
| // -1: unrecognized | |
| // other: next token position | |
| type NextIndex = number | typeof UNRECOGNIZED | |
| type Recognizer<T extends any[] = []> = (i: number, ...rest: T) => NextIndex | |
| function parse(s: string): boolean { | |
| const isRecognizedChar: Recognizer<[string]> = (i, allowedCharacters) => { | |
| if (allowedCharacters.includes(s[i])) { | |
| return i + 1 | |
| } | |
| return UNRECOGNIZED | |
| } | |
| const isRecognizedSome: Recognizer<[Iterable<Recognizer>]> = (i, recognizers) => { | |
| for (const recognizer of recognizers) { | |
| const nextIndex = recognizer(i) | |
| if (nextIndex !== UNRECOGNIZED) { | |
| return nextIndex | |
| } | |
| } | |
| return UNRECOGNIZED | |
| } | |
| const isRecognizedExpression: Recognizer = (i) => { | |
| const nextI = isRecognizedSome(i, [isRecognizedUnion, isRecognizedNonUnion]) | |
| if (nextI !== UNRECOGNIZED) { | |
| return nextI | |
| } | |
| return UNRECOGNIZED | |
| } | |
| const isRecognizedUnion: Recognizer = (i) => { | |
| const nextI = isRecognizedNonUnion(i) | |
| if (nextI !== UNRECOGNIZED) { | |
| const nextNextI = isRecognizedChar(nextI, '|') | |
| if (nextNextI !== UNRECOGNIZED) { | |
| return isRecognizedExpression(nextNextI) | |
| } | |
| return UNRECOGNIZED | |
| } | |
| return UNRECOGNIZED | |
| } | |
| const isRecognizedNonUnion: Recognizer = (i) => { | |
| const nextI = isRecognizedSome(i, [ | |
| isRecognizedStar, | |
| (i) => isRecognizedChar(i, CHARSET), | |
| isRecognizedGroup, | |
| ]) | |
| if (nextI !== UNRECOGNIZED) { | |
| const nextNextI = isRecognizedNonUnion(nextI) | |
| if (nextNextI !== UNRECOGNIZED) { | |
| return nextNextI | |
| } | |
| return nextI | |
| } | |
| return UNRECOGNIZED | |
| } | |
| const isRecognizedStar: Recognizer = (i) => { | |
| const nextI = isRecognizedSome(i, [ | |
| (i) => isRecognizedChar(i, CHARSET), | |
| isRecognizedGroup, | |
| ]) | |
| if (nextI !== UNRECOGNIZED) { | |
| const nextNextI = isRecognizedChar(nextI, '*') | |
| if (nextNextI !== UNRECOGNIZED) { | |
| return nextNextI | |
| } | |
| return UNRECOGNIZED | |
| } | |
| return UNRECOGNIZED | |
| } | |
| const isRecognizedGroup: Recognizer = (i) => { | |
| let nextI = isRecognizedChar(i, '(') | |
| if (nextI !== UNRECOGNIZED) { | |
| nextI = isRecognizedExpression(nextI) | |
| if (nextI !== UNRECOGNIZED) { | |
| return isRecognizedChar(nextI, ')') | |
| } | |
| return UNRECOGNIZED | |
| } | |
| return UNRECOGNIZED | |
| } | |
| return isRecognizedExpression(0) === s.length | |
| } | |
| if (parse('(01|10)*|()')) { | |
| console.log('it is regular expression') | |
| } else { | |
| console.log('it is not regular expression') | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is very inefficient backtracking based LL parser, which recognizes regular expression with binary alphabet.
Note that this doesn't ignore whitespaces.
This code itself is not useful, but I did it because I'd like some ideas of applying high order function. I've tried not to use impure functions like
getchar(which was common during my university's undergraduate compiler lecture..)Here is the EBNF for recognizing languages. Note that, with LL parser, the order of non terminal symbols in right hand side DOES matter. If some non terminal recognizes
awhile other recognizesab, the later one must be examined first. It's not the problem of ambiguity, rather it's more likely to robustness while preserving the performance. (Considerabcwhileab,bc,ais valid butcis not)