Skip to content

Instantly share code, notes, and snippets.

@aquapi
Created November 19, 2024 17:49
Show Gist options
  • Save aquapi/5cdf9eba8181e2a056c94f9af816b30e to your computer and use it in GitHub Desktop.
Basic lexer
const std = @import("std");
const mem = std.mem;
/// A lexical token: a `Kind` tag plus the slice of source text it carries.
/// Tokens with no useful payload (operators, brackets, …) store an empty slice.
pub const Token = struct {
/// Growable list of tokens; the lexer appends into one of these.
pub const ArrayList = std.ArrayList(Token);
const StringMap = std.StaticStringMap(Kind);
/// Compile-time map from keyword spelling to its token kind.
/// Looked up with `getLongestPrefix` during identifier analysis.
pub const keywordsMap = StringMap.initComptime(.{
// Conditions
.{ "if", Kind.keywordIf },
.{ "else", Kind.keywordElse },
// Loops
.{ "while", Kind.keywordWhile },
.{ "for", Kind.keywordFor },
.{ "continue", Kind.keywordContinue },
.{ "break", Kind.keywordBreak },
// Declarations
.{ "let", Kind.keywordLet },
.{ "const", Kind.keywordConst },
.{ "fn", Kind.keywordFn },
.{ "class", Kind.keywordClass },
.{ "nil", Kind.keywordNil },
});
/// Every kind of token the lexer can produce.
pub const Kind = enum {
// Conditional keywords
keywordIf,
keywordElse,
// Loop keywords
keywordWhile,
keywordFor,
keywordContinue,
keywordBreak,
// Declarations keywords
keywordLet,
keywordConst,
keywordFn,
keywordClass,
// Constant keywords
keywordNil,
// Literal
literalInt,
literalFloat,
// Parens
leftParen,
rightParen,
// Square brackets
leftSquare,
rightSquare,
// Curly braces
leftCurly,
rightCurly,
// Operators
less,
greater,
equal,
plus,
minus,
// Other types
asterisk,
slash,
dot,
comma,
semicolon,
// Quotes
singleQuote,
doubleQuote,
// Special type
comment,
commentMultilines,
identifier,
unexpected,
};
// The token's category.
kind: Kind,
// Slice of the source buffer this token was read from; empty when the
// kind alone fully describes the token (operators, punctuation, …).
value: []const u8,
};
/// Streaming lexer: repeatedly shrinks `buf` from the front while
/// appending tokens to `tokens`. `buf` must outlive the token list,
/// since token payloads are slices into it.
pub const Lexer = struct {
    buf: []const u8,
    tokens: Token.ArrayList,

    /// Append a token carrying `val` as payload and advance the input to `newBuf`.
    fn createTokenWithPayload(self: *Lexer, kind: Token.Kind, val: []const u8, newBuf: []const u8) !void {
        try self.tokens.append(.{ .kind = kind, .value = val });
        self.buf = newBuf;
    }

    /// Append a payload-less token and advance the input to `newBuf`.
    fn createToken(self: *Lexer, kind: Token.Kind, newBuf: []const u8) !void {
        try self.tokens.append(.{ .kind = kind, .value = &[_]u8{} });
        self.buf = newBuf;
    }

    /// Analyze input beginning with '/': a single-line comment, a
    /// multi-line comment, or the plain slash operator.
    fn analyzeSlash(self: *Lexer) !void {
        // A lone '/' at end of input is just the slash operator.
        // (Reading buf[1] unconditionally would index out of bounds.)
        if (self.buf.len < 2) {
            try self.createToken(.slash, self.buf[1..]);
            return;
        }
        switch (self.buf[1]) {
            // Single line comment: runs until the next newline.
            '/' => {
                const newBuf = self.buf[2..];
                if (mem.indexOfScalar(u8, newBuf, '\n')) |idx| {
                    try self.createTokenWithPayload(.comment, newBuf[0..idx], newBuf[idx..]);
                } else {
                    // Comment ends the file
                    try self.createTokenWithPayload(.comment, newBuf, &[_]u8{});
                }
            },
            // Multi-line comment: runs until the closing "*/".
            '*' => {
                const newBuf = self.buf[2..];
                if (mem.indexOf(u8, newBuf, "*/")) |idx| {
                    // Resume AFTER the 2-byte "*/" terminator so it is not
                    // re-lexed as `.asterisk` + `.slash`, and tag the token
                    // with the dedicated multi-line kind.
                    try self.createTokenWithPayload(.commentMultilines, newBuf[0..idx], newBuf[idx + 2 ..]);
                } else {
                    // Unterminated comment: report the opener, keep lexing after it.
                    try self.createTokenWithPayload(.unexpected, "/*", newBuf);
                }
            },
            else => try self.createToken(.slash, self.buf[1..]),
        }
    }

    /// Analyze an identifier, which may turn out to be a keyword.
    fn analyzeIdentifier(self: *Lexer) !void {
        var idx: usize = 1;
        if (Token.keywordsMap.getLongestPrefix(self.buf)) |kv| {
            // If match any of the keywords
            const keyLen = kv.key.len;
            // Keyword sits exactly at end of input: emit it and stop.
            // (Reading buf[keyLen] here would index out of bounds.)
            if (keyLen == self.buf.len) {
                try self.createToken(kv.value, &[_]u8{});
                return;
            }
            switch (self.buf[keyLen]) {
                'A'...'Z', 'a'...'z', '0'...'9', '_' => {
                    // Keyword is only a prefix of a longer identifier;
                    // keep scanning from the first unchecked byte.
                    idx = keyLen + 1;
                },
                else => {
                    // Standalone keyword: skip to after it.
                    try self.createToken(kv.value, self.buf[keyLen..]);
                    return;
                },
            }
        }
        while (idx < self.buf.len) : (idx += 1) {
            switch (self.buf[idx]) {
                'A'...'Z', 'a'...'z', '0'...'9', '_' => {},
                else => {
                    try self.createTokenWithPayload(.identifier, self.buf[0..idx], self.buf[idx..]);
                    return;
                },
            }
        }
        // Identifier runs to end of input.
        try self.createTokenWithPayload(.identifier, self.buf, &[_]u8{});
    }

    /// Analyze an integer or float literal.
    fn analyzeNumber(self: *Lexer) !void {
        var isFloat: bool = false;
        for (self.buf[1..], 1..) |ch, idx| {
            switch (ch) {
                '0'...'9' => continue,
                '.' => {
                    // A second '.' terminates the float; report the stray dot.
                    if (isFloat) {
                        // buf[0..idx] is the whole literal before this dot
                        // (using `idx - 1` would drop its last digit).
                        try self.createTokenWithPayload(.literalFloat, self.buf[0..idx], self.buf);
                        // Resume AFTER the dot so it is not re-lexed as `.dot`.
                        try self.createTokenWithPayload(.unexpected, ".", self.buf[idx + 1 ..]);
                        return;
                    }
                    isFloat = true;
                },
                else => {
                    try self.createTokenWithPayload(if (isFloat) .literalFloat else .literalInt, self.buf[0..idx], self.buf[idx..]);
                    return;
                },
            }
        }
        // Literal runs to end of input.
        try self.createTokenWithPayload(if (isFloat) .literalFloat else .literalInt, self.buf, &[_]u8{});
    }

    /// Tokenize the whole buffer. Every analyze/create helper both appends
    /// a token and advances `self.buf`, so the loop always makes progress.
    pub fn run(self: *Lexer) !void {
        while (self.buf.len != 0) {
            switch (self.buf[0]) {
                // Ignore whitespaces
                ' ', '\t', '\r', '\n' => self.buf = self.buf[1..],
                'A'...'Z', 'a'...'z', '_' => try self.analyzeIdentifier(),
                '0'...'9' => try self.analyzeNumber(),
                '(' => try self.createToken(.leftParen, self.buf[1..]),
                ')' => try self.createToken(.rightParen, self.buf[1..]),
                '[' => try self.createToken(.leftSquare, self.buf[1..]),
                ']' => try self.createToken(.rightSquare, self.buf[1..]),
                '{' => try self.createToken(.leftCurly, self.buf[1..]),
                '}' => try self.createToken(.rightCurly, self.buf[1..]),
                '<' => try self.createToken(.less, self.buf[1..]),
                '>' => try self.createToken(.greater, self.buf[1..]),
                '=' => try self.createToken(.equal, self.buf[1..]),
                '+' => try self.createToken(.plus, self.buf[1..]),
                '-' => try self.createToken(.minus, self.buf[1..]),
                '*' => try self.createToken(.asterisk, self.buf[1..]),
                '/' => try self.analyzeSlash(),
                '.' => try self.createToken(.dot, self.buf[1..]),
                ',' => try self.createToken(.comma, self.buf[1..]),
                ';' => try self.createToken(.semicolon, self.buf[1..]),
                '\'' => try self.createToken(.singleQuote, self.buf[1..]),
                '"' => try self.createToken(.doubleQuote, self.buf[1..]),
                else => |ch| try self.createTokenWithPayload(.unexpected, &[_]u8{ch}, self.buf[1..]),
            }
        }
    }
};
/// Demo entry point: lex a small hard-coded program and dump the tokens.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    // Release the allocator and run its leak check on exit; the returned
    // check result is not actionable here, so it is discarded.
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();
    const program =
        \\ fn main() void {
        \\ let x = 9;
        \\ x *= 16;
        \\ println("Hello world");
        \\ println(x);
        \\ }
    ;
    var lexer = Lexer{ .buf = program, .tokens = Token.ArrayList.init(allocator) };
    // Free the token list so the GPA leak check passes.
    defer lexer.tokens.deinit();
    try lexer.run();
    // Print the tokens themselves, not the ArrayList wrapper struct
    // (which would include its capacity and allocator fields).
    std.debug.print("{any}\n", .{lexer.tokens.items});
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment