Last active
March 4, 2023 20:10
-
-
Save harlanhaskins/1d14f1ab048256d8dfa2f875f893b30d to your computer and use it in GitHub Desktop.
Building a Compiler in Swift with LLVM, Part 1: Introduction and the Lexer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if os(macOS) | |
import Darwin | |
#elseif os(Linux) | |
import Glibc | |
#endif | |
enum BinaryOperator: Character { | |
case plus = "+" | |
case minus = "-" | |
case times = "*" | |
case divide = "/" | |
case mod = "%" | |
case equals = "=" | |
} | |
enum Token { | |
case leftParen, rightParen, def, extern, comma, semicolon, `if`, then, `else` | |
case identifier(String) | |
case number(Double) | |
case `operator`(BinaryOperator) | |
} | |
extension Character { | |
var value: Int32 { | |
return Int32(String(self).unicodeScalars.first!.value) | |
} | |
var isSpace: Bool { | |
return isspace(value) != 0 | |
} | |
var isAlphanumeric: Bool { | |
return isalnum(value) != 0 || self == "_" | |
} | |
} | |
class Lexer { | |
let input: String | |
var index: String.Index | |
init(input: String) { | |
self.input = input | |
self.index = input.startIndex | |
} | |
var currentChar: Character? { | |
return index < input.endIndex ? input[index] : nil | |
} | |
func advanceIndex() { | |
input.characters.formIndex(after: &index) | |
} | |
func readIdentifierOrNumber() -> String { | |
var str = "" | |
while let char = currentChar, char.isAlphanumeric || char == "." { | |
str.characters.append(char) | |
advanceIndex() | |
} | |
return str | |
} | |
func advanceToNextToken() -> Token? { | |
// Skip all spaces until a non-space token | |
while let char = currentChar, char.isSpace { | |
advanceIndex() | |
} | |
// If we hit the end of the input, then we're done | |
guard let char = currentChar else { | |
return nil | |
} | |
// Handle single-scalar tokens, like comma, | |
// leftParen, rightParen, and the operators | |
let singleTokMapping: [Character: Token] = [ | |
",": .comma, "(": .leftParen, ")": .rightParen, | |
";": .semicolon, "+": .operator(.plus), "-": .operator(.minus), | |
"*": .operator(.times), "/": .operator(.divide), | |
"%": .operator(.mod), "=": .operator(.equals) | |
] | |
if let tok = singleTokMapping[char] { | |
advanceIndex() | |
return tok | |
} | |
// This is where we parse identifiers or numbers | |
// We're going to use Swift's built-in double parsing | |
// logic here. | |
if char.isAlphanumeric { | |
var str = readIdentifierOrNumber() | |
if let dblVal = Double(str) { | |
return .number(dblVal) | |
} | |
// Look for known tokens, otherwise fall back to | |
// the identifier token | |
switch str { | |
case "def": return .def | |
case "extern": return .extern | |
case "if": return .if | |
case "then": return .then | |
case "else": return .else | |
default: return .identifier(str) | |
} | |
} | |
return nil | |
} | |
func lex() -> [Token] { | |
var toks = [Token]() | |
while let tok = advanceToNextToken() { | |
toks.append(tok) | |
} | |
return toks | |
} | |
} | |
let toks = Lexer(input: "def foo(n) (n * 100.34);").lex() | |
print(toks) |
@kdawgwilk I'm so glad you found this useful!
I refactored your code into this:
enum BinaryOperator: Character {
case plus = "+"
case minus = "-"
case times = "*"
case divide = "/"
case mod = "%"
case equals = "="
}
enum Single: Character {
case leftParen = "("
case rightParen = ")"
case comma = ","
case semicolon = ";"
}
enum Keywords: String {
case def, extern, `if`, then, `else`
}
enum Token {
case keys(Keywords)
case identifier(String)
case number(Double)
case `operator`(BinaryOperator)
case sub(Single)
}
and the implementation of advanceToNextToken
func will be simpler (I think):
func advanceToNextToken() -> Token? {
// Skip all spaces until a non-space token
while let char = currentChar, char.isSpace {
advanceIndex()
}
// If we hit the end of the input, then we're done
guard let char = currentChar else {
return nil
}
// Handle single-scalar tokens, like comma,
// leftParen, rightParen, and the operators
if let tokSig = Single(rawValue: char) {
advanceIndex()
return .sub(tokSig)
}
if let tokOp = BinaryOperator(rawValue: char) {
advanceIndex()
return .operator(tokOp)
}
// This is where we parse identifiers or numbers
// We're going to use Swift's built-in double parsing
// logic here.
if char.isAlphanumeric {
let str = readIdentifierOrNumber()
if let dblVal = Double(str) {
return .number(dblVal)
}
// Look for known tokens, otherwise fall back to
// the identifier token
if let k = Keywords(rawValue: str) {
return .keys(k)
}
return .identifier(str)
}
return nil
}
what do you think?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you so much for taking the time to write the blog post that goes with this. I really became interested in compilers when Swift was released and became open source and this is exactly the type of articles I have been wanting to read. LLVM is fascinating to me and to see it used from Swift makes it feel more approachable than from C/C++. Again thanks and keep it up! Can't wait to read the next part in the series!