Last active
March 4, 2023 20:10
-
-
Save harlanhaskins/1d14f1ab048256d8dfa2f875f893b30d to your computer and use it in GitHub Desktop.
Building a Compiler in Swift with LLVM, Part 1: Introduction and the Lexer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if os(macOS) | |
import Darwin | |
#elseif os(Linux) | |
import Glibc | |
#endif | |
/// The binary operators understood by the lexer.
///
/// The raw value of each case is the single character that spells the
/// operator in source text.
enum BinaryOperator: Character {
    // Additive operators
    case plus = "+", minus = "-"
    // Multiplicative operators
    case times = "*", divide = "/", mod = "%"
    // The "=" operator
    case equals = "="
}
/// A single lexical token.
enum Token {
    // Punctuation
    case leftParen
    case rightParen
    case comma
    case semicolon

    // Keywords
    case def
    case extern
    case `if`
    case then
    case `else`

    // Tokens that carry a payload
    /// A name, carrying its spelling.
    case identifier(String)
    /// A numeric literal, carrying its value.
    case number(Double)
    /// A binary operator.
    case `operator`(BinaryOperator)
}
extension Character {
    /// The code point of this character's first Unicode scalar, as an `Int32`.
    var value: Int32 {
        // A `Character` always holds at least one scalar, so `first!` cannot
        // fail, and scalar values never exceed 0x10FFFF, so the conversion
        // to `Int32` cannot overflow.
        return Int32(String(self).unicodeScalars.first!.value)
    }

    /// `value` when it lies in the 7-bit ASCII range, otherwise `nil`.
    ///
    /// The C `<ctype.h>` classification functions are only defined for
    /// arguments representable as `unsigned char` (or `EOF`); passing an
    /// arbitrary Unicode scalar value is undefined behaviour. Everything
    /// outside ASCII is therefore filtered out before calling them.
    private var ctypeArgument: Int32? {
        let code = value
        return (code >= 0 && code <= 0x7F) ? code : nil
    }

    /// Whether this character is ASCII whitespace according to `isspace`.
    /// Non-ASCII characters are never considered whitespace.
    var isSpace: Bool {
        guard let code = ctypeArgument else { return false }
        return isspace(code) != 0
    }

    /// Whether this character is an ASCII letter or digit according to
    /// `isalnum`, or an underscore. Non-ASCII characters are never
    /// considered alphanumeric.
    var isAlphanumeric: Bool {
        if self == "_" { return true }
        guard let code = ctypeArgument else { return false }
        return isalnum(code) != 0
    }
}
/// Splits an input string into `Token`s.
///
/// The lexer keeps a cursor (`index`) into `input`; each call to
/// `advanceToNextToken()` consumes characters from the cursor onwards.
class Lexer {
    /// Tokens that are spelled with exactly one character. Built once
    /// instead of on every call to `advanceToNextToken()`.
    private static let singleCharacterTokens: [Character: Token] = [
        ",": .comma, "(": .leftParen, ")": .rightParen,
        ";": .semicolon, "+": .operator(.plus), "-": .operator(.minus),
        "*": .operator(.times), "/": .operator(.divide),
        "%": .operator(.mod), "=": .operator(.equals)
    ]

    /// The text being lexed.
    let input: String

    /// Position of the next character to be consumed.
    var index: String.Index

    /// Creates a lexer positioned at the start of `input`.
    init(input: String) {
        self.input = input
        self.index = input.startIndex
    }

    /// The character under the cursor, or `nil` once the cursor has reached
    /// the end of the input.
    var currentChar: Character? {
        return index < input.endIndex ? input[index] : nil
    }

    /// Moves the cursor forward by one character. Does nothing when the
    /// cursor is already at the end of the input.
    func advanceIndex() {
        // Advancing past `endIndex` would trap, so stop there.
        guard index < input.endIndex else { return }
        // `String.index(after:)` is used rather than the `characters` view,
        // which is no longer available in Swift 5.
        index = input.index(after: index)
    }

    /// Consumes and returns the longest run of alphanumeric characters,
    /// underscores and "." starting at the cursor.
    ///
    /// - Returns: The consumed text; empty if the current character does not
    ///   qualify or the input is exhausted.
    func readIdentifierOrNumber() -> String {
        var str = ""
        while let char = currentChar, char.isAlphanumeric || char == "." {
            str.append(char)
            advanceIndex()
        }
        return str
    }

    /// Skips whitespace, then consumes and returns the next token.
    ///
    /// - Returns: The next token, or `nil` when the input is exhausted.
    ///   `nil` is also returned, without consuming anything, when the
    ///   current character cannot start any token.
    func advanceToNextToken() -> Token? {
        // Skip all spaces until a non-space character.
        while let char = currentChar, char.isSpace {
            advanceIndex()
        }

        // If we hit the end of the input, then we're done.
        guard let char = currentChar else {
            return nil
        }

        // Handle single-character tokens, like comma, leftParen,
        // rightParen, and the operators.
        if let tok = Lexer.singleCharacterTokens[char] {
            advanceIndex()
            return tok
        }

        // Anything else must start an identifier, keyword or number.
        guard char.isAlphanumeric else {
            return nil
        }

        let str = readIdentifierOrNumber()

        // Use Swift's built-in double parsing, but only for text that starts
        // with a decimal digit: `Double(String)` also accepts spellings such
        // as "inf" and "nan", which must stay identifiers.
        if char >= "0" && char <= "9", let dblVal = Double(str) {
            return .number(dblVal)
        }

        // Look for known keywords, otherwise fall back to the identifier
        // token.
        switch str {
        case "def": return .def
        case "extern": return .extern
        case "if": return .if
        case "then": return .then
        case "else": return .else
        default: return .identifier(str)
        }
    }

    /// Lexes from the current cursor position until `advanceToNextToken()`
    /// returns `nil`.
    ///
    /// - Returns: The tokens produced, in source order. Because an
    ///   unrecognised character also yields `nil`, lexing stops silently at
    ///   the first such character.
    func lex() -> [Token] {
        var toks = [Token]()
        while let tok = advanceToNextToken() {
            toks.append(tok)
        }
        return toks
    }
}
// Demo: lex a small function definition and print the resulting tokens.
let lexer = Lexer(input: "def foo(n) (n * 100.34);")
let toks = lexer.lex()
print(toks)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I refactored your code into this:
and the implementation of
advanceToNextToken
func will be simpler (I think). What do you think?