Created
November 19, 2024 17:49
-
-
Save aquapi/5cdf9eba8181e2a056c94f9af816b30e to your computer and use it in GitHub Desktop.
Basic lexer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
const mem = std.mem; | |
/// A single lexeme: its kind plus (for payload-carrying kinds such as
/// identifiers, literals and comments) the source slice it covers.
pub const Token = struct {
    /// Growable list of tokens, as produced by `Lexer.run`.
    pub const ArrayList = std.ArrayList(Token);

    const StringMap = std.StaticStringMap(Kind);

    /// Compile-time map from reserved-word spellings to their token kinds.
    pub const keywordsMap = StringMap.initComptime(.{
        // Conditions
        .{ "if", .keywordIf },
        .{ "else", .keywordElse },
        // Loops
        .{ "while", .keywordWhile },
        .{ "for", .keywordFor },
        .{ "continue", .keywordContinue },
        .{ "break", .keywordBreak },
        // Declarations
        .{ "let", .keywordLet },
        .{ "const", .keywordConst },
        .{ "fn", .keywordFn },
        .{ "class", .keywordClass },
        // Constants
        .{ "nil", .keywordNil },
    });

    /// Every kind of token the lexer can emit.
    pub const Kind = enum {
        // Conditional keywords
        keywordIf,
        keywordElse,
        // Loop keywords
        keywordWhile,
        keywordFor,
        keywordContinue,
        keywordBreak,
        // Declaration keywords
        keywordLet,
        keywordConst,
        keywordFn,
        keywordClass,
        // Constant keywords
        keywordNil,
        // Literals
        literalInt,
        literalFloat,
        // Parentheses
        leftParen,
        rightParen,
        // Square brackets
        leftSquare,
        rightSquare,
        // Curly braces
        leftCurly,
        rightCurly,
        // Operators
        less,
        greater,
        equal,
        plus,
        minus,
        asterisk,
        slash,
        // Punctuation
        dot,
        comma,
        semicolon,
        // Quotes
        singleQuote,
        doubleQuote,
        // Special kinds
        comment,
        commentMultilines,
        identifier,
        unexpected,
    };

    // What kind of token this is.
    kind: Kind,
    // Source text backing the token; empty for payload-less kinds.
    value: []const u8,
};
/// Hand-written lexer: consumes `buf` from the front and appends tokens to
/// `tokens`. Errors are only allocation failures from the token list.
pub const Lexer = struct {
    /// Remaining, not-yet-consumed input.
    buf: []const u8,
    /// Tokens produced so far; owned by the caller (see `main`).
    tokens: Token.ArrayList,

    /// Append a token carrying `val` as payload, then continue lexing at `newBuf`.
    fn createTokenWithPayload(self: *Lexer, kind: Token.Kind, val: []const u8, newBuf: []const u8) !void {
        try self.tokens.append(.{ .kind = kind, .value = val });
        self.buf = newBuf;
    }

    /// Append a payload-less token, then continue lexing at `newBuf`.
    fn createToken(self: *Lexer, kind: Token.Kind, newBuf: []const u8) !void {
        try self.createTokenWithPayload(kind, &[_]u8{}, newBuf);
    }

    /// Lex a '/' at buf[0]: division, `//` line comment, or `/* ... */` block comment.
    fn analyzeSlash(self: *Lexer) !void {
        // A lone '/' at end of input is just a slash token. (The original
        // indexed buf[1] unconditionally and went out of bounds here.)
        if (self.buf.len < 2) {
            try self.createToken(.slash, self.buf[1..]);
            return;
        }
        switch (self.buf[1]) {
            // Single-line comment: runs to the newline (kept in the buffer) or EOF.
            '/' => {
                const newBuf = self.buf[2..];
                if (mem.indexOfScalar(u8, newBuf, '\n')) |idx| {
                    try self.createTokenWithPayload(.comment, newBuf[0..idx], newBuf[idx..]);
                } else {
                    // Comment ends the file.
                    try self.createTokenWithPayload(.comment, newBuf, &[_]u8{});
                }
            },
            // Block comment.
            '*' => {
                const newBuf = self.buf[2..];
                if (mem.indexOf(u8, newBuf, "*/")) |idx| {
                    // Use the dedicated kind and skip the closing "*/" as well.
                    // The original emitted .comment and left "*/" in the
                    // buffer, so it was re-lexed as `*` and `/` tokens.
                    try self.createTokenWithPayload(.commentMultilines, newBuf[0..idx], newBuf[idx + 2 ..]);
                } else {
                    // Unterminated block comment: flag it and consume the rest
                    // of the input so the comment body is not lexed as garbage.
                    try self.createTokenWithPayload(.unexpected, "/*", &[_]u8{});
                }
            },
            else => try self.createToken(.slash, self.buf[1..]),
        }
    }

    /// Lex an identifier (or keyword) starting at buf[0].
    fn analyzeIdentifier(self: *Lexer) !void {
        // buf[0] is already known to be an identifier-start character.
        var idx: usize = 1;
        if (Token.keywordsMap.getLongestPrefix(self.buf)) |kv| {
            const keyLen = kv.key.len;
            // Keyword ends the input (e.g. a file ending in "nil"): the
            // original indexed buf[keyLen] here and went out of bounds.
            if (keyLen == self.buf.len) {
                try self.createToken(kv.value, self.buf[keyLen..]);
                return;
            }
            switch (self.buf[keyLen]) {
                'A'...'Z', 'a'...'z', '0'...'9', '_' => {
                    // Identifier merely starts with a keyword ("iffy", "format").
                    idx = keyLen + 1;
                },
                else => {
                    // Clean keyword: skip past it.
                    try self.createToken(kv.value, self.buf[keyLen..]);
                    return;
                },
            }
        }
        while (idx < self.buf.len) {
            switch (self.buf[idx]) {
                'A'...'Z', 'a'...'z', '0'...'9', '_' => {
                    idx += 1;
                    continue;
                },
                else => {
                    try self.createTokenWithPayload(.identifier, self.buf[0..idx], self.buf[idx..]);
                    return;
                },
            }
        }
        // Identifier ends the file.
        try self.createTokenWithPayload(.identifier, self.buf, &[_]u8{});
    }

    /// Lex an integer or float literal starting at buf[0].
    fn analyzeNumber(self: *Lexer) !void {
        var isFloat: bool = false;
        for (self.buf[1..], 1..) |ch, idx| {
            switch (ch) {
                '0'...'9' => continue,
                '.' => {
                    if (isFloat) {
                        // Second dot ends the float: "1.2.3" yields float
                        // "1.2" then an unexpected ".". The original sliced
                        // buf[0 .. idx - 1] (dropping the last digit) and left
                        // the dot in the buffer, emitting it a second time.
                        const value = self.buf[0..idx];
                        const dot = self.buf[idx .. idx + 1];
                        const rest = self.buf[idx + 1 ..];
                        try self.createTokenWithPayload(.literalFloat, value, dot);
                        try self.createTokenWithPayload(.unexpected, dot, rest);
                        return;
                    }
                    isFloat = true;
                },
                else => {
                    try self.createTokenWithPayload(if (isFloat) .literalFloat else .literalInt, self.buf[0..idx], self.buf[idx..]);
                    return;
                },
            }
        }
        // Number ends the file.
        try self.createTokenWithPayload(if (isFloat) .literalFloat else .literalInt, self.buf, &[_]u8{});
    }

    /// Main loop: dispatch on the first unconsumed byte until the input is empty.
    pub fn run(self: *Lexer) !void {
        while (self.buf.len != 0) {
            switch (self.buf[0]) {
                // Ignore whitespace.
                ' ', '\t', '\r', '\n' => self.buf = self.buf[1..],
                'A'...'Z', 'a'...'z', '_' => try self.analyzeIdentifier(),
                '0'...'9' => try self.analyzeNumber(),
                '(' => try self.createToken(.leftParen, self.buf[1..]),
                ')' => try self.createToken(.rightParen, self.buf[1..]),
                '[' => try self.createToken(.leftSquare, self.buf[1..]),
                ']' => try self.createToken(.rightSquare, self.buf[1..]),
                '{' => try self.createToken(.leftCurly, self.buf[1..]),
                '}' => try self.createToken(.rightCurly, self.buf[1..]),
                '<' => try self.createToken(.less, self.buf[1..]),
                '>' => try self.createToken(.greater, self.buf[1..]),
                '=' => try self.createToken(.equal, self.buf[1..]),
                '+' => try self.createToken(.plus, self.buf[1..]),
                '-' => try self.createToken(.minus, self.buf[1..]),
                '*' => try self.createToken(.asterisk, self.buf[1..]),
                '/' => try self.analyzeSlash(),
                '.' => try self.createToken(.dot, self.buf[1..]),
                ',' => try self.createToken(.comma, self.buf[1..]),
                ';' => try self.createToken(.semicolon, self.buf[1..]),
                '\'' => try self.createToken(.singleQuote, self.buf[1..]),
                '"' => try self.createToken(.doubleQuote, self.buf[1..]),
                // Anything else becomes a one-byte unexpected token.
                else => |ch| try self.createTokenWithPayload(.unexpected, &[_]u8{ch}, self.buf[1..]),
            }
        }
    }
};
/// Demo entry point: lex a small hard-coded program and print its tokens.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Report leaks (and catch double-frees) when the program exits.
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    const program =
        \\ fn main() void {
        \\ let x = 9;
        \\ x *= 16;
        \\ println("Hello world");
        \\ println(x);
        \\ }
    ;

    var lexer = Lexer{ .buf = program, .tokens = Token.ArrayList.init(allocator) };
    // The original never freed the token list, which the GPA flags as a leak.
    defer lexer.tokens.deinit();
    try lexer.run();

    // Print the tokens themselves; `{any}` on the ArrayList struct would also
    // dump its internal pointer/capacity fields.
    std.debug.print("{any}\n", .{lexer.tokens.items});
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment