Skip to content

Instantly share code, notes, and snippets.

@eczn
Last active May 3, 2024 04:32
Show Gist options
  • Save eczn/9b417988a67a6ff3790d8b7163a177e4 to your computer and use it in GitHub Desktop.
Save eczn/9b417988a67a6ff3790d8b7163a177e4 to your computer and use it in GitHub Desktop.
parser combinator 词法解析器 (a lexer built from parser combinators)
// Token kinds emitted by the lexer: an integer literal plus the
// operators and parentheses of a simple arithmetic grammar.
// NOTE(review): separators are inconsistent (`;` after some variants,
// none after Value) — likely mangled formatting from the paste; code
// kept byte-identical here.
enum Token {
Value(Int)
LParen;
RParen;
Plus;
Minus;
Multiply;
Divide;
} derive(Debug)
// A Lexer[V] wraps a function from the remaining input string to an
// optional (parsed value, rest-of-input) pair; None signals no match.
type Lexer[V] (String) -> Option[(V, String)]
// Run the wrapped lexing function on `str`.
// Returns Some((value, remaining_input)) on success, None on failure.
fn parse[V](self: Lexer[V], str: String) -> Option[(V, String)] {
(self.0)(str)
}
// Build a single-character lexer: succeeds (consuming exactly one
// character) iff the first character of the input satisfies `predicate`.
fn pchar(predicate: (Char) -> Bool) -> Lexer[Char] {
Lexer(fn (input) {
if input.length() > 0 && predicate(input[0]) {
// Drop the consumed character; `~start` passes 1 as a labelled argument.
let start = 1
Some((input[0], input.substring(~start)))
} else {
None
}
})
}
// Lexer for the six operator/parenthesis symbols, mapped onto their
// Token variants.
// NOTE(review): the `.map` pattern match has no wildcard arm — safe only
// because the predicate above restricts input to these six characters.
let symbol: Lexer[Token] = pchar(fn (input) {
match input {
'+' | '-' | '*' | '/' | '(' | ')' => true
_ => false
}
}).map(fn {
'+' => Plus
'-' => Minus
'*' => Multiply
'/' => Divide
'(' => LParen
')' => RParen
})
// Lexer that matches a single space character (no tabs/newlines).
let whitespace: Lexer[Char] = pchar(fn (input) {
match input {
' ' => true
_ => false
}
})
// Transform the value produced by a lexer, leaving the remaining input
// untouched (functor map). `?` propagates None from the inner parse.
fn map[I, O](self: Lexer[I], f: (I) -> O) -> Lexer[O] {
Lexer(fn(input) {
let (value, rest) = self.parse(input)?
Some((f(value), rest))
})
}
// Sequential composition: run `self`, then `parser2` on the leftover
// input, pairing both results. Fails (None) if either side fails.
fn and[V1, V2](self: Lexer[V1], parser2: Lexer[V2]) -> Lexer[(V1, V2)] {
Lexer(fn (input) {
let (value, rest) = self.parse(input)?
let (value2, rest2) = parser2.parse(rest)?
Some(((value, value2), rest2))
})
}
// Alternation: try `self` first; on failure, retry `parser2` on the
// SAME (unconsumed) input.
fn or[Value](self: Lexer[Value], parser2: Lexer[Value]) -> Lexer[Value] {
Lexer(fn (input) {
match self.parse(input) {
None => parser2.parse(input)
Some(_) as result => result
}
})
}
// Zero-or-more repetition: apply `self` until it fails, collecting the
// values in order. Always succeeds (possibly with an empty list).
// NOTE(review): this loops forever if `self` can succeed without
// consuming input — none of the lexers in this file do, but the hazard
// is worth confirming before reuse.
fn many[Value](self: Lexer[Value]) -> Lexer[List[Value]] {
Lexer(fn (input) {
let mut rest = input
let mut cumul = List::Nil
while true {
match self.parse(rest) {
None => break
Some((value, new_rest)) => {
rest = new_rest
// Prepend for O(1); reversed below to restore input order.
cumul = Cons(value, cumul)
}
}
}
Some((cumul.reverse(), rest))
})
}
// Digit lexers for the ABNF rule below. 0x30..0x39 are the ASCII codes
// of '0'..'9'; subtracting 0x30 yields the digit's numeric value.
let zero: Lexer[Int] = pchar(fn { ch => ch == '0' }).map(fn { _ => 0 })
let one_to_nine: Lexer[Int] =
pchar(fn { ch => ch.to_int() >= 0x31 && ch.to_int() <= 0x39 })
.map(fn { ch => ch.to_int() - 0x30 })
let zero_to_nine: Lexer[Int] =
pchar(fn { ch => ch.to_int() >= 0x30 && ch.to_int() <= 0x39 })
.map(fn { ch => ch.to_int() - 0x30 })
// number = %x30 / (%x31-39) *(%30-39)
// Integer literal per the ABNF rule above: either a bare '0', or a
// leading 1-9 digit followed by any 0-9 digits, folded left into an Int
// (acc * 10 + digit), then wrapped in Token::Value.
let value: Lexer[Token] =
zero.or(
one_to_nine.and(
zero_to_nine.many()
).map(fn {
((i, ls)) => ls.fold_left(fn (acc, cur) {
acc * 10 + cur
}, init=i)
})
).map(Token::Value)
// Full token stream: ( ws* (value | symbol) ws* )* — each token may be
// surrounded by spaces; the map keeps only the token itself.
pub let tokens: Lexer[List[Token]] =
whitespace.many().and(value.or(symbol).and(whitespace.many()))
.map(fn { (_, (symbols, _)) => symbols })
.many()
// Entry point: smoke-run the tokenizer on a mixed input and print it.
// NOTE(review): `let Some(...)` is a partial pattern — this aborts if
// lexing fails; acceptable for a demo, not for library code.
fn init {
let input = " + 123 313 +- /*22"
let Some((result, _)) = tokens.parse(input)
debug(result.to_array())
}
// TypeScript version adapted from lexer.mbt above
// Char: a branded string type documenting "exactly one character".
// NOTE(review): `{ length: 1 }` is a compile-time brand only — nothing
// enforces the length at runtime.
type Char = string & { length: 1 };
// Discriminated union of token kinds: an integer literal carrying its
// numeric value, plus the operators and parentheses.
type Token = (
| { type: 'token-value', number: number }
| { type: 'token-plus' }
| { type: 'token-minus' }
| { type: 'token-multiply' }
| { type: 'token-divide' }
| { type: 'token-lparen' }
| { type: 'token-rparen' }
)
/**
 * A combinator-style lexer. Wraps a function from the remaining input
 * string to either `null` (no match) or a `[value, rest-of-input]` pair.
 */
class Lexer<V> {
  constructor(
    // The raw lexing function; `null` signals failure.
    public parse: (str: string) => null | [V, string]
  ) {}

  /** Transform the matched value; the remaining input is untouched. */
  map<O>(f: (input: V) => O): Lexer<O> {
    return new Lexer<O>(input => {
      const res = this.parse(input)
      if (res === null) return null;
      const [value, rest] = res;
      return [f(value), rest]
    })
  }

  /** Sequence: run `this`, then `parser2` on the leftover input. */
  and<V2>(parser2: Lexer<V2>): Lexer<[V, V2]> {
    return new Lexer(input => {
      const res1 = this.parse(input)
      if (res1 === null) return null;
      const [value1, rest1] = res1;
      const res2 = parser2.parse(rest1);
      if (res2 === null) return null;
      const [value2, rest2] = res2;
      return [[value1, value2], rest2]
    })
  }

  /** Alternation: try `this`; on failure, retry `parser2` on the same input. */
  or(parser2: Lexer<V>): Lexer<V> {
    return new Lexer(input => {
      const res = this.parse(input)
      if (res === null) return parser2.parse(input)
      return res;
    })
  }

  /**
   * Zero-or-more repetition; always succeeds (possibly with `[]`).
   *
   * Bug fix: the original looped forever when the inner lexer succeeded
   * without consuming input (e.g. `new Lexer(s => [v, s])`). We now stop
   * as soon as an iteration makes no progress.
   */
  many(): Lexer<V[]> {
    return new Lexer(input => {
      let rest = input
      const cumul: V[] = []
      while (true) {
        const res = this.parse(rest)
        if (res === null) break;
        const [value, newRest] = res;
        cumul.push(value);
        // Guard against a non-consuming match: without this check the
        // loop would spin forever on such a lexer.
        if (newRest.length >= rest.length) { rest = newRest; break; }
        rest = newRest;
      }
      return [cumul, rest];
    })
  }
}
/** One-character lexer: matches iff the first char satisfies `predicate`. */
function pchar(predicate: (char: Char) => boolean): Lexer<Char> {
  return new Lexer(input => {
    if (input.length === 0) return null;
    const head = input[0] as Char;
    return predicate(head) ? [head, input.slice(1)] : null;
  })
}
// Lexer for the six operator/parenthesis symbols, mapped to their Token.
let symbol: Lexer<Token> = pchar(
  input => '+-*/()'.includes(input)
).map((char): Token => {
  switch (char) {
    case '+': return { type: 'token-plus' }
    case '-': return { type: 'token-minus' }
    case '*': return { type: 'token-multiply' }
    case '/': return { type: 'token-divide' }
    case '(': return { type: 'token-lparen' }
    case ')': return { type: 'token-rparen' }
    default: throw new Error('不可能走到这里')
  }
})
// Matches a single space character.
let whitespace: Lexer<Char> = pchar(ch => ch === ' ');
// A lone '0' digit, valued 0.
let zero: Lexer<number> = pchar(ch => ch === '0').map(() => 0)
// Shared helper: lexer for a char-code range of digits, yielding the
// digit's numeric value (0x30..0x39 are ASCII '0'..'9').
const digitInRange = (lo: number, hi: number): Lexer<number> =>
  pchar(ch => {
    const code = ch.charCodeAt(0);
    return code >= lo && code <= hi;
  }).map(ch => ch.charCodeAt(0) - 0x30);
// Digits 1-9 (the leading digit of a multi-digit number).
let oneToNine: Lexer<number> = digitInRange(0x31, 0x39)
// Any digit 0-9.
let zeroToNine: Lexer<number> = digitInRange(0x30, 0x39)
// Integer literal: either a bare '0', or a 1-9 digit followed by any
// number of 0-9 digits, folded left into its numeric value.
let value: Lexer<Token> = zero
  .or(
    oneToNine
      .and(zeroToNine.many())
      .map(([first, restDigits]) =>
        restDigits.reduce((acc, digit) => acc * 10 + digit, first))
  )
  .map((number): Token => ({ type: 'token-value', number }));
// Full token stream. Grammar: ( ws* (value | symbol) ws* )* — spaces
// around each token are consumed and discarded; only tokens are kept.
let tokens: Lexer<Token[]> = whitespace.many()
  .and(value.or(symbol).and(whitespace.many()))
  .map(([, [token]]) => token)
  .many();
// test !! — smoke-run the tokenizer on a mixed input and print the result.
(function main() {
  const input = ' + 123 313 +- /*22';
  const res = tokens.parse(input);
  console.log(res);
})();
@eczn
Copy link
Author

eczn commented May 2, 2024

https://github.com/eczn/se/blob/master/src/parse/index.ts
⬆️ Looking back at the parser I wrote five years ago for a Lisp interpreter — it can't compare with how elegant and correct this parser-combinator style is. The C-style getChar approach now looks so ugly I can't bear to read it 😂

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment