Last active
March 4, 2023 20:10
-
-
Save harlanhaskins/1d14f1ab048256d8dfa2f875f893b30d to your computer and use it in GitHub Desktop.
Building a Compiler in Swift with LLVM, Part 1: Introduction and the Lexer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if os(macOS) | |
import Darwin | |
#elseif os(Linux) | |
import Glibc | |
#endif | |
enum BinaryOperator: Character { | |
case plus = "+" | |
case minus = "-" | |
case times = "*" | |
case divide = "/" | |
case mod = "%" | |
case equals = "=" | |
} | |
enum Token { | |
case leftParen, rightParen, def, extern, comma, semicolon, `if`, then, `else` | |
case identifier(String) | |
case number(Double) | |
case `operator`(BinaryOperator) | |
} | |
extension Character { | |
var value: Int32 { | |
return Int32(String(self).unicodeScalars.first!.value) | |
} | |
var isSpace: Bool { | |
return isspace(value) != 0 | |
} | |
var isAlphanumeric: Bool { | |
return isalnum(value) != 0 || self == "_" | |
} | |
} | |
class Lexer { | |
let input: String | |
var index: String.Index | |
init(input: String) { | |
self.input = input | |
self.index = input.startIndex | |
} | |
var currentChar: Character? { | |
return index < input.endIndex ? input[index] : nil | |
} | |
func advanceIndex() { | |
input.characters.formIndex(after: &index) | |
} | |
func readIdentifierOrNumber() -> String { | |
var str = "" | |
while let char = currentChar, char.isAlphanumeric || char == "." { | |
str.characters.append(char) | |
advanceIndex() | |
} | |
return str | |
} | |
func advanceToNextToken() -> Token? { | |
// Skip all spaces until a non-space token | |
while let char = currentChar, char.isSpace { | |
advanceIndex() | |
} | |
// If we hit the end of the input, then we're done | |
guard let char = currentChar else { | |
return nil | |
} | |
// Handle single-scalar tokens, like comma, | |
// leftParen, rightParen, and the operators | |
let singleTokMapping: [Character: Token] = [ | |
",": .comma, "(": .leftParen, ")": .rightParen, | |
";": .semicolon, "+": .operator(.plus), "-": .operator(.minus), | |
"*": .operator(.times), "/": .operator(.divide), | |
"%": .operator(.mod), "=": .operator(.equals) | |
] | |
if let tok = singleTokMapping[char] { | |
advanceIndex() | |
return tok | |
} | |
// This is where we parse identifiers or numbers | |
// We're going to use Swift's built-in double parsing | |
// logic here. | |
if char.isAlphanumeric { | |
var str = readIdentifierOrNumber() | |
if let dblVal = Double(str) { | |
return .number(dblVal) | |
} | |
// Look for known tokens, otherwise fall back to | |
// the identifier token | |
switch str { | |
case "def": return .def | |
case "extern": return .extern | |
case "if": return .if | |
case "then": return .then | |
case "else": return .else | |
default: return .identifier(str) | |
} | |
} | |
return nil | |
} | |
func lex() -> [Token] { | |
var toks = [Token]() | |
while let tok = advanceToNextToken() { | |
toks.append(tok) | |
} | |
return toks | |
} | |
} | |
let toks = Lexer(input: "def foo(n) (n * 100.34);").lex() | |
print(toks) |
I refactored your code into this:
enum BinaryOperator: Character {
case plus = "+"
case minus = "-"
case times = "*"
case divide = "/"
case mod = "%"
case equals = "="
}
enum Single: Character {
case leftParen = "("
case rightParen = ")"
case comma = ","
case semicolon = ";"
}
enum Keywords: String {
case def, extern, `if`, then, `else`
}
enum Token {
case keys(Keywords)
case identifier(String)
case number(Double)
case `operator`(BinaryOperator)
case sub(Single)
}
and the implementation of advanceToNextToken
func will be simpler (I think):
func advanceToNextToken() -> Token? {
// Skip all spaces until a non-space token
while let char = currentChar, char.isSpace {
advanceIndex()
}
// If we hit the end of the input, then we're done
guard let char = currentChar else {
return nil
}
// Handle single-scalar tokens, like comma,
// leftParen, rightParen, and the operators
if let tokSig = Single(rawValue: char) {
advanceIndex()
return .sub(tokSig)
}
if let tokOp = BinaryOperator(rawValue: char) {
advanceIndex()
return .operator(tokOp)
}
// This is where we parse identifiers or numbers
// We're going to use Swift's built-in double parsing
// logic here.
if char.isAlphanumeric {
let str = readIdentifierOrNumber()
if let dblVal = Double(str) {
return .number(dblVal)
}
// Look for known tokens, otherwise fall back to
// the identifier token
if let k = Keywords(rawValue: str) {
return .keys(k)
}
return .identifier(str)
}
return nil
}
what do you think?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@kdawgwilk I'm so glad you found this useful!