Skip to content

Instantly share code, notes, and snippets.

@chriseidhof
Last active December 28, 2020 04:36
Show Gist options
  • Save chriseidhof/4860a12bf36a48f28b55769db1004d95 to your computer and use it in GitHub Desktop.
Save chriseidhof/4860a12bf36a48f28b55769db1004d95 to your computer and use it in GitHub Desktop.
Faster Parsers
//
// Operators.swift
// FastParsing
//
// Created by Chris Eidhof on 26/12/2016.
// Copyright © 2016 objc.io. All rights reserved.
//
// TODO: give appropriate credit. Many parts were stolen from SwiftParsec.
import Foundation
extension Character {
var isSpace: Bool { // stolen from SwiftParsec
switch self {
case " ", "\t", "\n", "\r", "\r\n": return true
case "\u{000B}", "\u{000C}": return true // Form Feed, vertical tab
default: return false
}
}
}
// Some quick benchmarks (not at all scientific, just a larger file):
//
// Using a CharacterView is fast (0.6s). Using a CharacterView with a `position` property (and only mutating the position property) is a bit faster (0.5).
// Using an Array is a lot faster (0.4). An ArraySlice (without a position) has similar performance.
struct Remainder { // This is basically an ArraySlice, but I'm not sure if ArraySlices will ever change...
let original: [Character]
var position: Int
let endIndex: Int // todo: not sure if we need to cache this?
}
extension Remainder: Collection {
func string() -> String {
return String(original)
}
mutating func next() -> Character? {
guard position < endIndex else { return nil }
let character = original[position]
position = original.index(after: position)
return character
}
init(_ string: String) {
original = Array(string.characters)
position = original.startIndex
endIndex = original.endIndex
}
typealias Index = Int
var startIndex: Index { return position }
subscript(index: Index) -> Character {
return original[index]
}
func index(after i: Index) -> Index {
return original.index(after: i)
}
}
extension Remainder {
mutating func scanCharacter() -> Character? {
return scanCharacter(condition: { _ in true })
}
mutating func peek() -> Character? {
guard position < endIndex else { return nil }
return original[position]
}
mutating func scanCharacter(condition: (Character) -> Bool) -> Character? {
guard position < endIndex else { return nil }
let character = original[position]
guard condition(character) else {
return nil
}
position = index(after: position)
return character
}
}
struct Parser<A> {
let parse: (inout Remainder) -> A?
}
struct MyCharacterSet {
var characters: Set<Character>
}
extension Parser {
func run(string: String) -> A? {
var substring = Remainder(string)
let result = parse(&substring)
guard substring.isEmpty else {
fatalError("Not empty: \(substring.string()). Parsed \(result)")
}
return result
}
}
extension Parser where A: Equatable {
func test(string: String, value: A) {
guard let result = run(string: string) else {
fatalError("Expected \(value), got nil (\(string))")
}
assert(result == value)
}
}
extension Parser {
init(lazy parser: @escaping() -> Parser) {
parse = { (state: inout Remainder) in
parser().parse(&state)
}
}
static var fail: Parser<A> {
return Parser { _ in nil }
}
static func character(condition: @escaping (Character) -> Bool) -> Parser<Character> {
return Parser<Character> { (input: inout Remainder) in
return input.scanCharacter(condition: condition)
}
}
static func character(_ character: Character) -> Parser<Character> {
return Parser.character(condition: { $0 == character })
}
var many: Parser<[A]> {
return Parser<[A]> { (input: inout Remainder) in
var result: [A] = []
while let x = self.parse(&input) {
result.append(x)
}
return result
}
}
func manyTill<B>(_ end: Parser<B>) -> Parser<[A]> {
return Parser<[A]> { (input: inout Remainder) in
var result: [A] = []
while !input.isEmpty {
if let _ = end.parse(&input) {
return result
} else if let value = self.parse(&input) {
result.append(value)
}
}
// else
return nil
}
}
var many1: Parser<[A]> {
return Parser<[A]> { (input: inout Remainder) in
guard let value = self.parse(&input) else { return nil }
return self.many.parse(&input).map { [value] + $0 }
}
}
func map<B>(_ f: @escaping (A) -> B) -> Parser<B> {
return Parser<B> { (input: inout Remainder) in
return self.parse(&input).map(f)
}
}
func flatMap<B>(_ f: @escaping (A) -> Parser<B>) -> Parser<B> {
return Parser<B> { (input: inout Remainder) in
if let result = self.parse(&input) {
return f(result).parse(&input)
}
return nil
}
}
func flatMap<B>(_ f: @escaping (A) -> B?) -> Parser<B> {
return Parser<B> { (input: inout Remainder) in
if let result = self.parse(&input) {
return f(result)
}
return nil
}
}
// Backtracks if parsing fails
var attempt: Parser<A> {
return Parser<A>(parse: { (input: inout Remainder) in
let original = input.position
if let result = self.parse(&input) { return result }
input.position = original
return nil
})
}
// Succeeds if parsing fails
var noOccurence: Parser<()> {
return Parser<()>(parse: { (input: inout Remainder) in
var copy = input
if self.parse(&copy) == nil {
return ()
} else {
return nil
}
})
}
func onlyIf(peek f: @escaping (Character) -> Bool) -> Parser<A> {
return Parser<A>(parse: { (input: inout Remainder) in
guard let c = input.peek(), f(c) else { return nil }
return self.parse(&input)
})
}
init(result: A) {
self.parse = { _ in return result }
}
public func otherwise(_ result: A) -> Parser<A> {
return self <|> Parser(result: result)
}
var optional: Parser<A?> {
return map { .some($0) }.otherwise(nil)
}
static var any: Parser<Character> {
return character { _ in true }
}
func between<P1,P2>(_ p1: Parser<P1>, _ p2: Parser<P2>) -> Parser<A> {
return Parser { (s: inout Remainder) in
return (p1 *> self <* p2).parse(&s)
}
}
static func surrounded(by character: Character) -> Parser<String> {
let delimiter = P.character(character)
return delimiter *> ({ String($0) } <^> P.any.manyTill(delimiter))
}
static func choice(_ parsers: [Parser<A>]) -> Parser<A> {
return parsers.reduce(Parser.fail, <|>)
}
}
func <|><A>(l: Parser<A>, r: Parser<A>) -> Parser<A> {
return Parser(parse: { (s: inout Remainder) -> A? in
if let result = l.parse(&s) { return result }
return r.parse(&s)
})
}
typealias P = Parser<()> // Useful for static methods
extension Parser {
init<P1, P2>(lift f: @escaping (P1, P2) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>) {
self.parse = { (input: inout Remainder) in
guard let x = p1.parse(&input),
let y = p2.parse(&input) else { return nil }
return f(x,y)
}
}
init<P1, P2, P3>(lift f: @escaping (P1, P2, P3) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>, _ p3: Parser<P3>) {
self.parse = { (input: inout Remainder) in
p1.parse(&input).flatMap { x in
p2.parse(&input).flatMap { y in
p3.parse(&input).flatMap { z in
f(x,y,z)
}
}
}
}
}
init<P1, P2, P3, P4>(lift f: @escaping (P1, P2, P3, P4) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>, _ p3: Parser<P3>, _ p4: Parser<P4>) {
self.parse = { (input: inout Remainder) in
p1.parse(&input).flatMap { x in
p2.parse(&input).flatMap { y in
p3.parse(&input).flatMap { z in
p4.parse(&input).flatMap { a in
f(x,y,z,a)
}
}
}
}
}
}
init<P1, P2, P3, P4, P5>(lift f: @escaping (P1, P2, P3, P4, P5) -> A, _ p1: Parser<P1>, _ p2: Parser<P2>, _ p3: Parser<P3>, _ p4: Parser<P4>, _ p5: Parser<P5>) {
self.parse = { (input: inout Remainder) in
p1.parse(&input).flatMap { x in
p2.parse(&input).flatMap { y in
p3.parse(&input).flatMap { z in
p4.parse(&input).flatMap { a in
p5.parse(&input).flatMap { b in
f(x,y,z,a,b)
}
}
}
}
}
}
}
}
precedencegroup SequencePrecedence {
associativity: left
higherThan: ChoicePrecedence
}
precedencegroup ChoicePrecedence {
associativity: left
higherThan: BindPrecedence
}
precedencegroup BindPrecedence {
associativity: left
// higherThan: LabelPrecedence
}
infix operator <^>: SequencePrecedence
infix operator <*>: SequencePrecedence
infix operator *>: SequencePrecedence
infix operator <*: SequencePrecedence
infix operator <|>: ChoicePrecedence
func *><A,B>(p1: Parser<A>, p2: Parser<B>) -> Parser<B> {
return Parser(parse: { (s: inout Remainder) in
guard let _ = p1.parse(&s),
let result = p2.parse(&s) else { return nil }
return result
})
}
func <*<A,B>(p1: Parser<A>, p2: Parser<B>) -> Parser<A> {
return Parser(parse: { (s: inout Remainder) -> A? in
guard let result = p1.parse(&s),
let _ = p2.parse(&s) else { return nil }
return result
})
//return Parser<A>(lift: { x, _ in x }, p1, p2)
}
func <^><A,B>(f: @escaping (A) -> B, p2: Parser<A>) -> Parser<B> {
return p2.map(f)
}
// Foundation-only
import Foundation
extension CharacterSet {
func contains(_ char: Character) -> Bool {
let scalars = String(char).unicodeScalars
guard scalars.count == 1 else { return false }
return contains(scalars.first!)
}
var parse: Parser<Character> {
return P.character(condition: self.contains)
}
}
extension Parser {
static var digit: Parser<Character> {
return CharacterSet.decimalDigits.parse
}
static var alphaNumeric: Parser<Character> {
return CharacterSet.alphanumerics.parse
}
static var tab: Parser<Character> {
return P.character("\t")
}
static var space: Parser<Character> {
return P.character { $0.isSpace }
}
static var eof: Parser<()> {
return Parser<()>{ (stream: inout Remainder) in
if stream.isEmpty {
return ()
} else {
return nil
}
}
}
static var newLine: Parser<Character> {
return character("\n")
}
static var openingParen: Parser<Character> {
return character("(")
}
static var closingParen: Parser<Character> {
return character(")")
}
static func oneOf<C: Collection>(_ collection: C) -> Parser<Character> where C.Iterator.Element == Character {
return character { collection.contains($0) }
}
}
// Ledger
let naturalString = P.digit.many1.map { String($0) }
let naturalWithCommasString = (P.digit <|> P.character(",")).many1.map( { digitsAndCommas in digitsAndCommas.filter { $0 != "," } }).map { String($0) }
let natural = naturalString.map { Int($0)! }
let unsignedDouble = Parser<LedgerDouble>(lift: { integerPart, fractionalPart in
guard let fraction = fractionalPart else { return LedgerDouble(integerPart)! }
return LedgerDouble("\(integerPart).\(fraction)")!
}, naturalWithCommasString, (P.character(".") *> naturalString).optional)
let double = Parser<LedgerDouble>(lift: { $0 == nil ? $1 : -$1 }, P.character("-").optional, unsignedDouble)
let noNewline = P.character { $0 != "\n" }
let spaceWithoutNewline = P.character(" ") <|> P.tab
let noSpace = P.character { !$0.isSpace }
let singleSpace = (P.character(" ") <* P.space.noOccurence).attempt
let newlineAndSpacedNewlines = P.newLine *> P.space.many
extension Parser {
var lexeme: Parser<A> {
return self <* spaceWithoutNewline.many
}
}
extension Parser {
static func string(_ s: String) -> Parser<String> {
return Parser<String> { (remainder: inout Remainder) in
for c in s.characters {
if remainder.scanCharacter(condition: { $0 == c }) == nil {
return nil
}
}
return s
}
}
}
// The part below is stolen from SwiftParsec!
/// This enumeration specifies the associativity of operators: right, left or none.
public enum Associativity {
case right, left, none
}
enum Operator<Result> {
/// Infix operator and associativity.
case infix(Parser<(Result, Result) -> Result>, Associativity)
/// Prefix operator.
case prefix(Parser<(Result) -> Result>)
/// Postfix operator.
case postfix(Parser<(Result) -> Result>)
}
struct OperatorTable<Result>: RangeReplaceableCollection, ExpressibleByArrayLiteral {
/// Represents a valid position in the operator table.
public typealias Index = Int
/// Operator table's generator.
public typealias Iterator = IndexingIterator<OperatorTable>
/// Element type of the operator table.
public typealias Element = [Operator<Result>]
/// The position of the first element.
public let startIndex = 0
/// The operator table's "past the end" position.
public var endIndex: Int { return table.count }
// Backing store.
private var table: [Element]
/// Create an instance initialized with elements.
///
/// - parameter arrayLiteral: Arrays of `Operator`.
public init(arrayLiteral elements: Element...) {
table = elements
}
/// Create an empty instance.
public init() { table = [] }
/// Returns the position immediately after i.
///
/// - SeeAlso: `IndexableBase` protocol.
public func index(after i: OperatorTable.Index) -> OperatorTable.Index {
return table.index(after: i)
}
/// Build an expression parser for terms returned by `combined` with operators from `self`, taking the associativity and precedence specified in `self` into account. Prefix and postfix operators of the same precedence can only occur once (i.e. `--2` is not allowed if `-` is prefix negate). Prefix and postfix operators of the same precedence associate to the left (i.e. if `++` is postfix increment, than `-2++` equals `-1`, not `-3`).
///
/// It takes care of all the complexity involved in building expression parser. Here is an example of an expression parser that handles prefix signs, postfix increment and basic arithmetic:
///
/// func binary(name: String, function: (Int, Int) -> Int, assoc: Associativity) -> Operator<String, (), Int> {
///
/// let opParser = StringParser.string(name) *> GenericParser(result: function)
/// return .Infix(opParser, assoc)
///
/// }
///
/// func prefix(name: String, function: Int -> Int) -> Operator<String, (), Int> {
///
/// let opParser = StringParser.string(name) *> GenericParser(result: function)
/// return .Prefix(opParser)
///
/// }
///
/// func postfix(name: String, function: Int -> Int) -> Operator<String, (), Int> {
///
/// let opParser = StringParser.string(name) *> GenericParser(result: function)
/// return .Postfix(opParser.attempt)
///
/// }
///
/// let opTable: OperatorTable<String, (), Int> = [
///
/// [
/// prefix("-", function: -),
/// prefix("+", function: { $0 })
/// ],
/// [
/// binary("^", function: power, assoc: .Right)
/// ],
/// [
/// binary("*", function: *, assoc: .Left),
/// binary("/", function: /, assoc: .Left)
/// ],
/// [
/// binary("+", function: +, assoc: .Left),
/// binary("-", function: -, assoc: .Left)
/// ]
///
/// ]
///
/// let openingParen = StringParser.character("(")
/// let closingParen = StringParser.character(")")
/// let decimal = GenericTokenParser<()>.decimal
///
/// let expression = opTable.expressionParser { expression in
///
/// expression.between(openingParen, closingParen) <|>
/// decimal <?> "simple expression"
///
/// } <?> "expression"
///
/// - parameter combine: A function receiving a 'simple expression' as parameter that can be nested in other expressions.
/// - returns: An expression parser for terms returned by `combined` with operators from `self`.
/// - SeeAlso: P.recursive(combine: GenericParser -> GenericParser) -> GenericParser
public func makeExpressionParser(_ combine: (_ expression: Parser<Result>) -> Parser<Result>) -> Parser<Result> {
var term: Parser<Result>!
let lazyTerm = Parser<Result>(lazy: { term })
let expr = reduce(lazyTerm) { buildParser($0, operators: $1) }
term = combine(expr)
return expr
}
private typealias InfixOperatorParser = Parser<(Result, Result) -> Result>
private typealias PrefixOperatorParser = Parser<(Result) -> Result>
private typealias PostfixOperatorParser = Parser<(Result) -> Result>
private typealias OperatorsTuple = (right: [InfixOperatorParser], left: [InfixOperatorParser], none: [InfixOperatorParser], prefix: [PrefixOperatorParser], postfix: [PostfixOperatorParser])
private func buildParser(_ term: Parser<Result>, operators: [Operator<Result>]) -> Parser<Result> {
let ops: OperatorsTuple = operators.reduce(([], [], [], [], []), splitOperators)
let rightAssocOp = Parser.choice(ops.right)
let leftAssocOp = Parser.choice(ops.left)
let nonAssocOp = Parser.choice(ops.none)
let prefixOp = Parser.choice(ops.prefix)
let postfixOp = Parser.choice(ops.postfix)
let rightAssocMsg = NSLocalizedString("right", comment: "Right-associative parser.")
let ambigiousRight = ambigious(rightAssocOp, assoc: rightAssocMsg)
let leftAssocMsg = NSLocalizedString("left", comment: "Left-associative parser.")
let ambigiousLeft = ambigious(leftAssocOp, assoc: leftAssocMsg)
let nonAssocMsg = NSLocalizedString("non", comment: "Non-associative parser.")
let ambigiousNon = ambigious(rightAssocOp, assoc: nonAssocMsg)
let prefixParser = prefixOp <|> Parser(result: { $0 })
let postfixParser = postfixOp <|> Parser(result: { $0 })
let termParser = prefixParser.flatMap { pre in
term.flatMap { t in
postfixParser.flatMap { post in
Parser(result: post(pre(t)))
}
}
}
func rightAssocParser(_ left: Result) -> Parser<Result> {
let rightTerm = termParser.flatMap { rightAssocParser1($0) }
let apply = rightAssocOp.flatMap { f in
rightTerm.flatMap { right in Parser(result: f(left, right)) }
}
return apply <|> ambigiousLeft <|> ambigiousNon
}
func rightAssocParser1(_ right: Result) -> Parser<Result> {
return rightAssocParser(right) <|> Parser(result: right)
}
func leftAssocParser(_ left: Result) -> Parser<Result> {
let apply = leftAssocOp.flatMap { f in
termParser.flatMap { right in
leftAssocParser1(f(left, right))
}
}
return apply <|> ambigiousRight <|> ambigiousNon
}
func leftAssocParser1(_ right: Result) -> Parser<Result> {
return leftAssocParser(right) <|> Parser(result: right)
}
func nonAssocParser(_ left: Result) -> Parser<Result> {
return nonAssocOp.flatMap { f in
termParser.flatMap { right in
ambigiousRight <|> ambigiousLeft <|> ambigiousNon <|>
Parser(result: f(left, right))
}
}
}
return termParser.flatMap { t in
rightAssocParser(t) <|> leftAssocParser(t) <|> nonAssocParser(t) <|>
Parser(result: t) // <?> NSLocalizedString("operator", comment: "Expression parser label.")
}
}
private func splitOperators(operators: OperatorsTuple, op: Operator<Result>) -> OperatorsTuple {
var ops = operators
switch op {
case .infix(let parser, let assoc):
switch assoc {
case .none:
var n = ops.none
n.append(parser)
ops.none = n
case .left:
var l = ops.left
l.append(parser)
ops.left = l
case .right:
var r = ops.right
r.append(parser)
ops.right = r
}
case .prefix(let parser):
var pre = ops.prefix
pre.append(parser)
ops.prefix = pre
case .postfix(let parser):
var post = ops.postfix
post.append(parser)
ops.postfix = post
}
return ops
}
private func ambigious(_ op: InfixOperatorParser, assoc: String) -> Parser<Result> {
let msg = NSLocalizedString("ambiguous use of a %@ associative operator", comment: "Expression parser.")
let localizedMsg = String.localizedStringWithFormat(msg, assoc as CVarArg)
return (op *> Parser.fail).attempt
}
/// Replace the given subRange of elements with newElements.
///
/// - parameters:
/// - subRange: Range of elements to replace.
/// - newElements: New elements replacing the previous elements contained in `subRange`.
public mutating func replaceSubrange<C: Collection>(_ subrange: Range<Index>, with newElements: C) where C.Iterator.Element == Iterator.Element {
table.replaceSubrange(subrange, with: newElements)
}
public subscript(position: Index) -> Element { return table[position] }
}
@bkase
Copy link

bkase commented Oct 13, 2017

Really really useful addition:

extension Parser {
    var peek: Parser {
        return Parser { input in
            let start = input.position
            defer { input.position = start }
            return self.parse(&input)
        }
    }
}

then you can do p.manyTill(q.peek) and the parser won't consume the characters that q needs to consume

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment