Last active
December 28, 2020 04:36
-
-
Save chriseidhof/4860a12bf36a48f28b55769db1004d95 to your computer and use it in GitHub Desktop.
Faster Parsers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// Operators.swift | |
// FastParsing | |
// | |
// Created by Chris Eidhof on 26/12/2016. | |
// Copyright © 2016 objc.io. All rights reserved. | |
// | |
// TODO: give appropriate credit. Many parts were stolen from SwiftParsec. | |
import Foundation | |
extension Character { | |
var isSpace: Bool { // stolen from SwiftParsec | |
switch self { | |
case " ", "\t", "\n", "\r", "\r\n": return true | |
case "\u{000B}", "\u{000C}": return true // Form Feed, vertical tab | |
default: return false | |
} | |
} | |
} | |
// Some quick benchmarks (not at all scientific, just a larger file): | |
// | |
// Using a CharacterView is fast (0.6s). Using a CharacterView with a `position` property (and only mutating the position property) is a bit faster (0.5). | |
// Using an Array is a lot faster (0.4). An ArraySlice (without a position) has similar performance. | |
struct Remainder { // This is basically an ArraySlice, but I'm not sure if ArraySlices will ever change... | |
let original: [Character] | |
var position: Int | |
let endIndex: Int // todo: not sure if we need to cache this? | |
} | |
extension Remainder: Collection { | |
func string() -> String { | |
return String(original) | |
} | |
mutating func next() -> Character? { | |
guard position < endIndex else { return nil } | |
let character = original[position] | |
position = original.index(after: position) | |
return character | |
} | |
init(_ string: String) { | |
original = Array(string.characters) | |
position = original.startIndex | |
endIndex = original.endIndex | |
} | |
typealias Index = Int | |
var startIndex: Index { return position } | |
subscript(index: Index) -> Character { | |
return original[index] | |
} | |
func index(after i: Index) -> Index { | |
return original.index(after: i) | |
} | |
} | |
extension Remainder { | |
mutating func scanCharacter() -> Character? { | |
return scanCharacter(condition: { _ in true }) | |
} | |
mutating func peek() -> Character? { | |
guard position < endIndex else { return nil } | |
return original[position] | |
} | |
mutating func scanCharacter(condition: (Character) -> Bool) -> Character? { | |
guard position < endIndex else { return nil } | |
let character = original[position] | |
guard condition(character) else { | |
return nil | |
} | |
position = index(after: position) | |
return character | |
} | |
} | |
struct Parser<A> { | |
let parse: (inout Remainder) -> A? | |
} | |
struct MyCharacterSet { | |
var characters: Set<Character> | |
} | |
extension Parser { | |
func run(string: String) -> A? { | |
var substring = Remainder(string) | |
let result = parse(&substring) | |
guard substring.isEmpty else { | |
fatalError("Not empty: \(substring.string()). Parsed \(result)") | |
} | |
return result | |
} | |
} | |
extension Parser where A: Equatable { | |
func test(string: String, value: A) { | |
guard let result = run(string: string) else { | |
fatalError("Expected \(value), got nil (\(string))") | |
} | |
assert(result == value) | |
} | |
} | |
extension Parser { | |
init(lazy parser: @escaping() -> Parser) { | |
parse = { (state: inout Remainder) in | |
parser().parse(&state) | |
} | |
} | |
static var fail: Parser<A> { | |
return Parser { _ in nil } | |
} | |
static func character(condition: @escaping (Character) -> Bool) -> Parser<Character> { | |
return Parser<Character> { (input: inout Remainder) in | |
return input.scanCharacter(condition: condition) | |
} | |
} | |
static func character(_ character: Character) -> Parser<Character> { | |
return Parser.character(condition: { $0 == character }) | |
} | |
var many: Parser<[A]> { | |
return Parser<[A]> { (input: inout Remainder) in | |
var result: [A] = [] | |
while let x = self.parse(&input) { | |
result.append(x) | |
} | |
return result | |
} | |
} | |
func manyTill<B>(_ end: Parser<B>) -> Parser<[A]> { | |
return Parser<[A]> { (input: inout Remainder) in | |
var result: [A] = [] | |
while !input.isEmpty { | |
if let _ = end.parse(&input) { | |
return result | |
} else if let value = self.parse(&input) { | |
result.append(value) | |
} | |
} | |
// else | |
return nil | |
} | |
} | |
var many1: Parser<[A]> { | |
return Parser<[A]> { (input: inout Remainder) in | |
guard let value = self.parse(&input) else { return nil } | |
return self.many.parse(&input).map { [value] + $0 } | |
} | |
} | |
func map<B>(_ f: @escaping (A) -> B) -> Parser<B> { | |
return Parser<B> { (input: inout Remainder) in | |
return self.parse(&input).map(f) | |
} | |
} | |
func flatMap<B>(_ f: @escaping (A) -> Parser<B>) -> Parser<B> { | |
return Parser<B> { (input: inout Remainder) in | |
if let result = self.parse(&input) { | |
return f(result).parse(&input) | |
} | |
return nil | |
} | |
} | |
func flatMap<B>(_ f: @escaping (A) -> B?) -> Parser<B> { | |
return Parser<B> { (input: inout Remainder) in | |
if let result = self.parse(&input) { | |
return f(result) | |
} | |
return nil | |
} | |
} | |
// Backtracks if parsing fails | |
var attempt: Parser<A> { | |
return Parser<A>(parse: { (input: inout Remainder) in | |
let original = input.position | |
if let result = self.parse(&input) { return result } | |
input.position = original | |
return nil | |
}) | |
} | |
// Succeeds if parsing fails | |
var noOccurence: Parser<()> { | |
return Parser<()>(parse: { (input: inout Remainder) in | |
var copy = input | |
if self.parse(©) == nil { | |
return () | |
} else { | |
return nil | |
} | |
}) | |
} | |
func onlyIf(peek f: @escaping (Character) -> Bool) -> Parser<A> { | |
return Parser<A>(parse: { (input: inout Remainder) in | |
guard let c = input.peek(), f(c) else { return nil } | |
return self.parse(&input) | |
}) | |
} | |
init(result: A) { | |
self.parse = { _ in return result } | |
} | |
public func otherwise(_ result: A) -> Parser<A> { | |
return self <|> Parser(result: result) | |
} | |
var optional: Parser<A?> { | |
return map { .some($0) }.otherwise(nil) | |
} | |
static var any: Parser<Character> { | |
return character { _ in true } | |
} | |
func between<P1,P2>(_ p1: Parser<P1>, _ p2: Parser<P2>) -> Parser<A> { | |
return Parser { (s: inout Remainder) in | |
return (p1 *> self <* p2).parse(&s) | |
} | |
} | |
static func surrounded(by character: Character) -> Parser<String> { | |
let delimiter = P.character(character) | |
return delimiter *> ({ String($0) } <^> P.any.manyTill(delimiter)) | |
} | |
static func choice(_ parsers: [Parser<A>]) -> Parser<A> { | |
return parsers.reduce(Parser.fail, <|>) | |
} | |
} | |
func <|><A>(l: Parser<A>, r: Parser<A>) -> Parser<A> { | |
return Parser(parse: { (s: inout Remainder) -> A? in | |
if let result = l.parse(&s) { return result } | |
return r.parse(&s) | |
}) | |
} | |
typealias P = Parser<()> // Useful for static methods | |
extension Parser { | |
init<P1, P2>(lift f: @escaping (P1, P2) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>) { | |
self.parse = { (input: inout Remainder) in | |
guard let x = p1.parse(&input), | |
let y = p2.parse(&input) else { return nil } | |
return f(x,y) | |
} | |
} | |
init<P1, P2, P3>(lift f: @escaping (P1, P2, P3) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>, _ p3: Parser<P3>) { | |
self.parse = { (input: inout Remainder) in | |
p1.parse(&input).flatMap { x in | |
p2.parse(&input).flatMap { y in | |
p3.parse(&input).flatMap { z in | |
f(x,y,z) | |
} | |
} | |
} | |
} | |
} | |
init<P1, P2, P3, P4>(lift f: @escaping (P1, P2, P3, P4) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>, _ p3: Parser<P3>, _ p4: Parser<P4>) { | |
self.parse = { (input: inout Remainder) in | |
p1.parse(&input).flatMap { x in | |
p2.parse(&input).flatMap { y in | |
p3.parse(&input).flatMap { z in | |
p4.parse(&input).flatMap { a in | |
f(x,y,z,a) | |
} | |
} | |
} | |
} | |
} | |
} | |
init<P1, P2, P3, P4, P5>(lift f: @escaping (P1, P2, P3, P4, P5) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>, _ p3: Parser<P3>, _ p4: Parser<P4>, _ p5: Parser<P5>) { | |
self.parse = { (input: inout Remainder) in | |
p1.parse(&input).flatMap { x in | |
p2.parse(&input).flatMap { y in | |
p3.parse(&input).flatMap { z in | |
p4.parse(&input).flatMap { a in | |
p5.parse(&input).flatMap { b in | |
f(x,y,z,a,b) | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
precedencegroup SequencePrecedence { | |
associativity: left | |
higherThan: ChoicePrecedence | |
} | |
precedencegroup ChoicePrecedence { | |
associativity: left | |
higherThan: BindPrecedence | |
} | |
precedencegroup BindPrecedence { | |
associativity: left | |
// higherThan: LabelPrecedence | |
} | |
infix operator <^>: SequencePrecedence | |
infix operator <*>: SequencePrecedence | |
infix operator *>: SequencePrecedence | |
infix operator <*: SequencePrecedence | |
infix operator <|>: ChoicePrecedence | |
func *><A,B>(p1: Parser<A>, p2: Parser<B>) -> Parser<B> { | |
return Parser(parse: { (s: inout Remainder) in | |
guard let _ = p1.parse(&s), | |
let result = p2.parse(&s) else { return nil } | |
return result | |
}) | |
} | |
func <*<A,B>(p1: Parser<A>, p2: Parser<B>) -> Parser<A> { | |
return Parser(parse: { (s: inout Remainder) -> A? in | |
guard let result = p1.parse(&s), | |
let _ = p2.parse(&s) else { return nil } | |
return result | |
}) | |
//return Parser<A>(lift: { x, _ in x }, p1, p2) | |
} | |
func <^><A,B>(f: @escaping (A) -> B, p2: Parser<A>) -> Parser<B> { | |
return p2.map(f) | |
} | |
// Foundation-only | |
import Foundation | |
extension CharacterSet { | |
func contains(_ char: Character) -> Bool { | |
let scalars = String(char).unicodeScalars | |
guard scalars.count == 1 else { return false } | |
return contains(scalars.first!) | |
} | |
var parse: Parser<Character> { | |
return P.character(condition: self.contains) | |
} | |
} | |
extension Parser { | |
static var digit: Parser<Character> { | |
return CharacterSet.decimalDigits.parse | |
} | |
static var alphaNumeric: Parser<Character> { | |
return CharacterSet.alphanumerics.parse | |
} | |
static var tab: Parser<Character> { | |
return P.character("\t") | |
} | |
static var space: Parser<Character> { | |
return P.character { $0.isSpace } | |
} | |
static var eof: Parser<()> { | |
return Parser<()>{ (stream: inout Remainder) in | |
if stream.isEmpty { | |
return () | |
} else { | |
return nil | |
} | |
} | |
} | |
static var newLine: Parser<Character> { | |
return character("\n") | |
} | |
static var openingParen: Parser<Character> { | |
return character("(") | |
} | |
static var closingParen: Parser<Character> { | |
return character(")") | |
} | |
static func oneOf<C: Collection>(_ collection: C) -> Parser<Character> where C.Iterator.Element == Character { | |
return character { collection.contains($0) } | |
} | |
} | |
// Ledger | |
let naturalString = P.digit.many1.map { String($0) } | |
let naturalWithCommasString = (P.digit <|> P.character(",")).many1.map( { digitsAndCommas in digitsAndCommas.filter { $0 != "," } }).map { String($0) } | |
let natural = naturalString.map { Int($0)! } | |
let unsignedDouble = Parser<LedgerDouble>(lift: { integerPart, fractionalPart in | |
guard let fraction = fractionalPart else { return LedgerDouble(integerPart)! } | |
return LedgerDouble("\(integerPart).\(fraction)")! | |
}, naturalWithCommasString, (P.character(".") *> naturalString).optional) | |
let double = Parser<LedgerDouble>(lift: { $0 == nil ? $1 : -$1 }, P.character("-").optional, unsignedDouble) | |
let noNewline = P.character { $0 != "\n" } | |
let spaceWithoutNewline = P.character(" ") <|> P.tab | |
let noSpace = P.character { !$0.isSpace } | |
let singleSpace = (P.character(" ") <* P.space.noOccurence).attempt | |
let newlineAndSpacedNewlines = P.newLine *> P.space.many | |
extension Parser { | |
var lexeme: Parser<A> { | |
return self <* spaceWithoutNewline.many | |
} | |
} | |
extension Parser { | |
static func string(_ s: String) -> Parser<String> { | |
return Parser<String> { (remainder: inout Remainder) in | |
for c in s.characters { | |
if remainder.scanCharacter(condition: { $0 == c }) == nil { | |
return nil | |
} | |
} | |
return s | |
} | |
} | |
} | |
// The part below is stolen from SwiftParsec! | |
/// This enumeration specifies the associativity of operators: right, left or none. | |
public enum Associativity { | |
case right, left, none | |
} | |
enum Operator<Result> { | |
/// Infix operator and associativity. | |
case infix(Parser<(Result, Result) -> Result>, Associativity) | |
/// Prefix operator. | |
case prefix(Parser<(Result) -> Result>) | |
/// Postfix operator. | |
case postfix(Parser<(Result) -> Result>) | |
} | |
struct OperatorTable<Result>: RangeReplaceableCollection, ExpressibleByArrayLiteral { | |
/// Represents a valid position in the operator table. | |
public typealias Index = Int | |
/// Operator table's generator. | |
public typealias Iterator = IndexingIterator<OperatorTable> | |
/// Element type of the operator table. | |
public typealias Element = [Operator<Result>] | |
/// The position of the first element. | |
public let startIndex = 0 | |
/// The operator table's "past the end" position. | |
public var endIndex: Int { return table.count } | |
// Backing store. | |
private var table: [Element] | |
/// Create an instance initialized with elements. | |
/// | |
/// - parameter arrayLiteral: Arrays of `Operator`. | |
public init(arrayLiteral elements: Element...) { | |
table = elements | |
} | |
/// Create an empty instance. | |
public init() { table = [] } | |
/// Returns the position immediately after i. | |
/// | |
/// - SeeAlso: `IndexableBase` protocol. | |
public func index(after i: OperatorTable.Index) -> OperatorTable.Index { | |
return table.index(after: i) | |
} | |
/// Build an expression parser for terms returned by `combined` with operators from `self`, taking the associativity and precedence specified in `self` into account. Prefix and postfix operators of the same precedence can only occur once (i.e. `--2` is not allowed if `-` is prefix negate). Prefix and postfix operators of the same precedence associate to the left (i.e. if `++` is postfix increment, than `-2++` equals `-1`, not `-3`). | |
/// | |
/// It takes care of all the complexity involved in building expression parser. Here is an example of an expression parser that handles prefix signs, postfix increment and basic arithmetic: | |
/// | |
/// func binary(name: String, function: (Int, Int) -> Int, assoc: Associativity) -> Operator<String, (), Int> { | |
/// | |
/// let opParser = StringParser.string(name) *> GenericParser(result: function) | |
/// return .Infix(opParser, assoc) | |
/// | |
/// } | |
/// | |
/// func prefix(name: String, function: Int -> Int) -> Operator<String, (), Int> { | |
/// | |
/// let opParser = StringParser.string(name) *> GenericParser(result: function) | |
/// return .Prefix(opParser) | |
/// | |
/// } | |
/// | |
/// func postfix(name: String, function: Int -> Int) -> Operator<String, (), Int> { | |
/// | |
/// let opParser = StringParser.string(name) *> GenericParser(result: function) | |
/// return .Postfix(opParser.attempt) | |
/// | |
/// } | |
/// | |
/// let opTable: OperatorTable<String, (), Int> = [ | |
/// | |
/// [ | |
/// prefix("-", function: -), | |
/// prefix("+", function: { $0 }) | |
/// ], | |
/// [ | |
/// binary("^", function: power, assoc: .Right) | |
/// ], | |
/// [ | |
/// binary("*", function: *, assoc: .Left), | |
/// binary("/", function: /, assoc: .Left) | |
/// ], | |
/// [ | |
/// binary("+", function: +, assoc: .Left), | |
/// binary("-", function: -, assoc: .Left) | |
/// ] | |
/// | |
/// ] | |
/// | |
/// let openingParen = StringParser.character("(") | |
/// let closingParen = StringParser.character(")") | |
/// let decimal = GenericTokenParser<()>.decimal | |
/// | |
/// let expression = opTable.expressionParser { expression in | |
/// | |
/// expression.between(openingParen, closingParen) <|> | |
/// decimal <?> "simple expression" | |
/// | |
/// } <?> "expression" | |
/// | |
/// - parameter combine: A function receiving a 'simple expression' as parameter that can be nested in other expressions. | |
/// - returns: An expression parser for terms returned by `combined` with operators from `self`. | |
/// - SeeAlso: P.recursive(combine: GenericParser -> GenericParser) -> GenericParser | |
public func makeExpressionParser(_ combine: (_ expression: Parser<Result>) -> Parser<Result>) -> Parser<Result> { | |
var term: Parser<Result>! | |
let lazyTerm = Parser<Result>(lazy: { term }) | |
let expr = reduce(lazyTerm) { buildParser($0, operators: $1) } | |
term = combine(expr) | |
return expr | |
} | |
private typealias InfixOperatorParser = Parser<(Result, Result) -> Result> | |
private typealias PrefixOperatorParser = Parser<(Result) -> Result> | |
private typealias PostfixOperatorParser = Parser<(Result) -> Result> | |
private typealias OperatorsTuple = (right: [InfixOperatorParser], left: [InfixOperatorParser], none: [InfixOperatorParser], prefix: [PrefixOperatorParser], postfix: [PostfixOperatorParser]) | |
private func buildParser(_ term: Parser<Result>, operators: [Operator<Result>]) -> Parser<Result> { | |
let ops: OperatorsTuple = operators.reduce(([], [], [], [], []), splitOperators) | |
let rightAssocOp = Parser.choice(ops.right) | |
let leftAssocOp = Parser.choice(ops.left) | |
let nonAssocOp = Parser.choice(ops.none) | |
let prefixOp = Parser.choice(ops.prefix) | |
let postfixOp = Parser.choice(ops.postfix) | |
let rightAssocMsg = NSLocalizedString("right", comment: "Right-associative parser.") | |
let ambigiousRight = ambigious(rightAssocOp, assoc: rightAssocMsg) | |
let leftAssocMsg = NSLocalizedString("left", comment: "Left-associative parser.") | |
let ambigiousLeft = ambigious(leftAssocOp, assoc: leftAssocMsg) | |
let nonAssocMsg = NSLocalizedString("non", comment: "Non-associative parser.") | |
let ambigiousNon = ambigious(rightAssocOp, assoc: nonAssocMsg) | |
let prefixParser = prefixOp <|> Parser(result: { $0 }) | |
let postfixParser = postfixOp <|> Parser(result: { $0 }) | |
let termParser = prefixParser.flatMap { pre in | |
term.flatMap { t in | |
postfixParser.flatMap { post in | |
Parser(result: post(pre(t))) | |
} | |
} | |
} | |
func rightAssocParser(_ left: Result) -> Parser<Result> { | |
let rightTerm = termParser.flatMap { rightAssocParser1($0) } | |
let apply = rightAssocOp.flatMap { f in | |
rightTerm.flatMap { right in Parser(result: f(left, right)) } | |
} | |
return apply <|> ambigiousLeft <|> ambigiousNon | |
} | |
func rightAssocParser1(_ right: Result) -> Parser<Result> { | |
return rightAssocParser(right) <|> Parser(result: right) | |
} | |
func leftAssocParser(_ left: Result) -> Parser<Result> { | |
let apply = leftAssocOp.flatMap { f in | |
termParser.flatMap { right in | |
leftAssocParser1(f(left, right)) | |
} | |
} | |
return apply <|> ambigiousRight <|> ambigiousNon | |
} | |
func leftAssocParser1(_ right: Result) -> Parser<Result> { | |
return leftAssocParser(right) <|> Parser(result: right) | |
} | |
func nonAssocParser(_ left: Result) -> Parser<Result> { | |
return nonAssocOp.flatMap { f in | |
termParser.flatMap { right in | |
ambigiousRight <|> ambigiousLeft <|> ambigiousNon <|> | |
Parser(result: f(left, right)) | |
} | |
} | |
} | |
return termParser.flatMap { t in | |
rightAssocParser(t) <|> leftAssocParser(t) <|> nonAssocParser(t) <|> | |
Parser(result: t) // <?> NSLocalizedString("operator", comment: "Expression parser label.") | |
} | |
} | |
private func splitOperators(operators: OperatorsTuple, op: Operator<Result>) -> OperatorsTuple { | |
var ops = operators | |
switch op { | |
case .infix(let parser, let assoc): | |
switch assoc { | |
case .none: | |
var n = ops.none | |
n.append(parser) | |
ops.none = n | |
case .left: | |
var l = ops.left | |
l.append(parser) | |
ops.left = l | |
case .right: | |
var r = ops.right | |
r.append(parser) | |
ops.right = r | |
} | |
case .prefix(let parser): | |
var pre = ops.prefix | |
pre.append(parser) | |
ops.prefix = pre | |
case .postfix(let parser): | |
var post = ops.postfix | |
post.append(parser) | |
ops.postfix = post | |
} | |
return ops | |
} | |
private func ambigious(_ op: InfixOperatorParser, assoc: String) -> Parser<Result> { | |
let msg = NSLocalizedString("ambiguous use of a %@ associative operator", comment: "Expression parser.") | |
let localizedMsg = String.localizedStringWithFormat(msg, assoc as CVarArg) | |
return (op *> Parser.fail).attempt | |
} | |
/// Replace the given subRange of elements with newElements. | |
/// | |
/// - parameters: | |
/// - subRange: Range of elements to replace. | |
/// - newElements: New elements replacing the previous elements contained in `subRange`. | |
public mutating func replaceSubrange<C: Collection>(_ subrange: Range<Index>, with newElements: C) where C.Iterator.Element == Iterator.Element { | |
table.replaceSubrange(subrange, with: newElements) | |
} | |
public subscript(position: Index) -> Element { return table[position] } | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Really really useful addition:
then you can do
p.manyTill(q.peek)
and the parser won't consume the characters that q needs to consume