Last active
May 3, 2024 04:32
-
-
Save eczn/9b417988a67a6ff3790d8b7163a177e4 to your computer and use it in GitHub Desktop.
parser combinator 词法解析器
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Token kinds produced by the lexer: integer literals plus the
/// arithmetic and grouping symbols of a small calculator grammar.
/// (Scrape artifacts removed; variant separators made consistent —
/// the original mixed bare and `;`-terminated variants.)
enum Token {
  Value(Int)
  LParen
  RParen
  Plus
  Minus
  Multiply
  Divide
} derive(Debug)
/// A lexer for values of type `V`: a function from the remaining
/// input string to `Some((value, rest))` on success, or `None`
/// when the input does not match.
type Lexer[V] (String) -> Option[(V, String)]
/// Apply the wrapped lexing function to `str`.
fn parse[V](self: Lexer[V], str: String) -> Option[(V, String)] {
  (self.0)(str)
}
/// Build a lexer that consumes exactly one character satisfying
/// `predicate`; fails on empty input or a non-matching first char.
fn pchar(predicate: (Char) -> Bool) -> Lexer[Char] {
  Lexer(fn (source) {
    if source.length() == 0 {
      None
    } else if predicate(source[0]) {
      let start = 1
      Some((source[0], source.substring(~start)))
    } else {
      None
    }
  })
}
/// Lexes a single arithmetic or grouping symbol into its `Token`.
/// The `map` match is intentionally partial: only characters the
/// predicate accepted can reach it.
let symbol: Lexer[Token] = pchar(fn (ch) {
  match ch {
    '+' | '-' | '*' | '/' | '(' | ')' => true
    _ => false
  }
}).map(fn (ch) {
  match ch {
    '+' => Plus
    '-' => Minus
    '*' => Multiply
    '/' => Divide
    '(' => LParen
    ')' => RParen
  }
})
/// Lexes a single space character (the only whitespace this
/// grammar recognises).
let whitespace: Lexer[Char] = pchar(fn (ch) { ch == ' ' })
/// Transform the value a lexer produces, leaving the remaining
/// input untouched; failure propagates unchanged.
fn map[I, O](self: Lexer[I], f: (I) -> O) -> Lexer[O] {
  Lexer(fn (input) {
    match self.parse(input) {
      None => None
      Some((value, rest)) => Some((f(value), rest))
    }
  })
}
/// Sequence two lexers: run `self`, then `parser2` on whatever
/// input is left; succeeds only if both succeed.
fn and[V1, V2](self: Lexer[V1], parser2: Lexer[V2]) -> Lexer[(V1, V2)] {
  Lexer(fn (input) {
    let (first, after_first) = self.parse(input)?
    let (second, after_second) = parser2.parse(after_first)?
    Some(((first, second), after_second))
  })
}
/// Ordered choice: try `self`; on failure, re-run `parser2` on the
/// same (unconsumed) input.
fn or[Value](self: Lexer[Value], parser2: Lexer[Value]) -> Lexer[Value] {
  Lexer(fn (input) {
    let attempt = self.parse(input)
    match attempt {
      Some(_) => attempt
      None => parser2.parse(input)
    }
  })
}
/// Zero-or-more repetitions of `self`. Never fails: zero matches
/// yields an empty list and the untouched input.
fn many[Value](self: Lexer[Value]) -> Lexer[List[Value]] {
  Lexer(fn (input) {
    let mut remaining = input
    let mut collected = List::Nil
    while true {
      match self.parse(remaining) {
        Some((item, rest)) => {
          remaining = rest
          collected = Cons(item, collected)
        }
        None => break
      }
    }
    // Items were consed on in reverse; restore source order.
    Some((collected.reverse(), remaining))
  })
}
/// Lexes the single digit '0' as the number 0.
let zero: Lexer[Int] = pchar(fn (ch) { ch == '0' }).map(fn { _ => 0 })
/// Lexes one digit '1'..'9' (code points 0x31..0x39) as its value.
let one_to_nine: Lexer[Int] =
  pchar(fn (ch) { ch.to_int() >= 0x31 && ch.to_int() <= 0x39 })
  .map(fn (ch) { ch.to_int() - 0x30 })
/// Lexes one digit '0'..'9' (code points 0x30..0x39) as its value.
let zero_to_nine: Lexer[Int] =
  pchar(fn (ch) { ch.to_int() >= 0x30 && ch.to_int() <= 0x39 })
  .map(fn (ch) { ch.to_int() - 0x30 })
// number = %x30 / (%x31-39) *(%30-39)
/// Lexes an unsigned integer literal per the ABNF rule above and
/// wraps it in `Token::Value`. A lone '0' is a number; otherwise a
/// leading nonzero digit is followed by any further digits, folded
/// left into the accumulated value.
let value: Lexer[Token] =
  zero.or(
    one_to_nine.and(zero_to_nine.many()).map(fn {
      ((head, tail)) =>
        tail.fold_left(fn (acc, digit) { acc * 10 + digit }, init=head)
    })
  ).map(Token::Value)
/// Top-level lexer: each token may be surrounded by spaces, which
/// are consumed and discarded; repeated zero or more times.
pub let tokens: Lexer[List[Token]] =
  whitespace.many()
  .and(value.or(symbol).and(whitespace.many()))
  .map(fn { (_, (tok, _)) => tok })
  .many()
/// Entry point / smoke test: lex a sample expression and dump the
/// resulting token list.
fn init {
  let source = " + 123 313 +- /*22"
  // Partial `let Some(...)` aborts on lex failure — acceptable for a demo.
  let Some((result, _)) = tokens.parse(source)
  debug(result.to_array())
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// TypeScript port of the MoonBit lexer.mbt above.
// A single character: a string that is (by convention) length 1.
// NOTE(review): the `& { length: 1 }` intersection is a documentation
// device only — values are produced via `as Char` casts, not checked.
type Char = string & { length: 1 };
// Discriminated union of all tokens the lexer can emit.
type Token = (
  | { type: 'token-value', number: number }
  | { type: 'token-plus' }
  | { type: 'token-minus' }
  | { type: 'token-multiply' }
  | { type: 'token-divide' }
  | { type: 'token-lparen' }
  | { type: 'token-rparen' }
)
class Lexer<V> { | |
constructor( | |
public parse: (str: string) => null | [V, string] | |
) {} | |
map<O>(f: (input: V) => O): Lexer<O> { | |
return new Lexer<O>(input => { | |
let res = this.parse(input) | |
if (res === null) return null; | |
const [value, rest] = res; | |
return [f(value), rest] | |
}) | |
} | |
and<V2>(parser2: Lexer<V2>): Lexer<[V, V2]> { | |
return new Lexer(input => { | |
const res1 = this.parse(input) | |
if (res1 === null) return null; | |
const [value1, rest1] = res1; | |
const res2 = parser2.parse(rest1); | |
if (res2 === null) return null; | |
const [value2, rest2] = res2; | |
return [[value1, value2], rest2] | |
}) | |
} | |
or(parser2: Lexer<V>): Lexer<V> { | |
return new Lexer(input => { | |
const res = this.parse(input) | |
if (res === null) return parser2.parse(input) | |
return res; | |
}) | |
} | |
many(): Lexer<V[]> { | |
return new Lexer(input => { | |
let rest = input | |
let cumul: V[] = [] | |
while (true) { | |
const res = this.parse(rest) | |
if (res === null) break; | |
const [value, newRest] = res; | |
rest = newRest; | |
cumul.push(value); | |
} | |
return [cumul, rest]; | |
}) | |
} | |
} | |
function pchar(predicate: (char: Char) => boolean): Lexer<Char> { | |
return new Lexer(input => { | |
if (input.length && predicate(input[0] as Char)) { | |
return [input[0] as Char, input.slice(1)] | |
} else { | |
return null; | |
} | |
}) | |
} | |
let symbol: Lexer<Token> = pchar(input => { | |
if (input === '+') return true | |
if (input === '-') return true | |
if (input === '*') return true | |
if (input === '/') return true | |
if (input === '(') return true | |
if (input === ')') return true | |
return false | |
}).map(char => { | |
if (char === '+') return { type: 'token-plus' } as Token | |
if (char === '-') return { type: 'token-minus' } as Token | |
if (char === '*') return { type: 'token-multiply' } as Token | |
if (char === '/') return { type: 'token-divide' } as Token | |
if (char === '(') return { type: 'token-lparen' } as Token | |
if (char === ')') return { type: 'token-rparen' } as Token | |
throw new Error('不可能走到这里') | |
}) | |
let whitespace: Lexer<Char> = pchar(input => (input === ' ')); | |
let zero: Lexer<number> = pchar(ch => ch === '0').map(i => 0) | |
let oneToNine: Lexer<number> = ( | |
pchar(ch => !!(ch.charCodeAt(0) >= 0x31 && ch.charCodeAt(0) <= 0x39)) | |
.map(ch => ch.charCodeAt(0) - 0x30) | |
) | |
let zeroToNine: Lexer<number> = ( | |
pchar(ch => !!(ch.charCodeAt(0) >= 0x30 && ch.charCodeAt(0) <= 0x39)) | |
.map(ch => ch.charCodeAt(0) - 0x30) | |
) | |
let value: Lexer<Token> = ( | |
zero.or( | |
oneToNine.and( | |
zeroToNine.many() | |
).map(input => { | |
const [i, ls] = input; | |
return ls.reduce((acc, cur) => acc * 10 + cur, i) | |
}) | |
).map(number => ({ type: 'token-value', number })) | |
); | |
let tokens: Lexer<Token[]> = ( | |
whitespace.many().and(value.or(symbol).and(whitespace.many())) | |
.map(input => { | |
const [_0, [token, _1]] = input; | |
return token | |
}) | |
.many() | |
); | |
// test !! | |
(() => { | |
let input = ' + 123 313 +- /*22'; | |
let res = tokens.parse(input) | |
console.log(res); | |
})() |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
https://github.com/eczn/se/blob/master/src/parse/index.ts
⬆️ 回头看一下五年前写 lisp 解释器写的 parser,完全比不上这种 parser combinator 这么优雅正确,现在感觉 C style 的那种 getChar 写法丑到完全看不下去了 😂