const std = @import("std");
const ast = @import("ast.zig");
const lex = @import("lexer.zig");

fn infixBindingPower(op: ast.Operator) struct { u8, u8 } {
    return switch (op) {
        .EXP => .{ 3, 4 },
        .ADD, .SUB => .{ 5, 6 },
        .MUL, .DIV => .{ 7, 8 },
    };
}
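
// How the binding-power pairs above drive precedence and associativity (a note
// added for clarity, not part of the original gist): the loop in parseExpression
// only folds in a peeked infix operator if its *left* power is at least the
// current minimum, and it parses that operator's right-hand side with the
// operator's *right* power as the new minimum. Because every pair is { n, n + 1 },
// operators on the same level group to the left, and higher pairs bind tighter:
// `1 + 2 * 3` parses as `1 + (2 * 3)`, and `1 - 2 - 3` as `(1 - 2) - 3`. Note that
// `.EXP` currently has the lowest pair, so `^` binds loosest here; conventional
// math precedence would give it the highest pair, and a right-associative `^`
// would use a reversed pair like `.{ 10, 9 }`.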
// Returning null indicates this operator doesn't bind as a prefix.
fn prefixBindingPower(op: ast.Operator) ?u8 {
    return switch (op) {
        .ADD, .SUB => 9,
        else => null,
    };
}
pub const Parser = struct {
    arena: std.heap.ArenaAllocator = undefined,
    allocator: std.mem.Allocator = undefined,
    tokens: []lex.Token,
    index: usize = 0,
    fail_msg: []const u8 = "",

    fn init(self: *Parser, allocator: std.mem.Allocator) void {
        self.arena = std.heap.ArenaAllocator.init(allocator); // TODO look into the state thingy, can i keep a smaller reference?
        self.allocator = self.arena.allocator();
    }
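
    // Re: the TODO above (an added note, and an assumption about the std.heap API
    // rather than anything this gist uses): std.heap.ArenaAllocator keeps its
    // resumable part in a `state` field (an ArenaAllocator.State), which can be
    // stored on its own and turned back into a full arena with
    // `state.promote(child_allocator)` when needed.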
    fn deinit(self: *Parser) void {
        self.arena.deinit();
    }

    inline fn currToken(self: Parser) lex.Token {
        return self.tokens[self.index];
    }

    inline fn peekToken(self: Parser) ?lex.Token {
        if ((self.index + 1) >= self.tokens.len) return null;
        return self.tokens[self.index + 1];
    }

    fn advanceToken(self: *Parser) void {
        self.index += 1;
    }
    fn enterParse(self: *Parser) !?*const ast.Expression {
        if (self.index == self.tokens.len) return null;
        // parseExpression only returns null on an eof token, which the lexer never
        // emits, so unwrapping here is safe in practice.
        return (try self.parseExpression(0)).?;
    }

    // XXX some people might make this throw...
    fn currIsSyntax(self: *Parser, syn: lex.SyntaxToken) bool {
        return switch (self.currToken()) {
            .syntax => |st| st == syn,
            else => false,
        };
    }
    fn parseExpression(self: *Parser, min_binding_power: u8) !?*const ast.Expression {
        var lhs = try self.allocator.create(ast.Expression);
        lhs.* = switch (self.currToken()) {
            .identifier => |ident| ast.Expression{ .identifier = ident },
            .literal => |lit| ast.Expression{ .literal = lit },
            .syntax => |syn| oblk: {
                _ = switch (syn) {
                    .LPAREN => {
                        // Step past the open paren before parsing the inner expression;
                        // without this the recursive call sees the '(' again and never terminates.
                        self.advanceToken();
                        var temp = try self.parseExpression(0) orelse {
                            try self.fail(error.NoExpressionAfterOpenParen, "parsed an open paren and expected an expression after but found none.", .{});
                            unreachable;
                        };
                        self.advanceToken();
                        if (!self.currIsSyntax(lex.SyntaxToken.RPAREN)) {
                            try self.fail(error.NoCloseParen, "expecting a close paren but found {}.", .{self.currToken()});
                            unreachable;
                        }
                        // Copying the value out is fine: the inner allocation lives in the arena until deinit.
                        break :oblk temp.*;
                    },
                    else => {
                        try self.fail(error.BadToken, "bad token {} for an lval.", .{syn});
                        unreachable;
                    },
                };
            },
            .operator => |op| blk: {
                const right_binding_power = prefixBindingPower(op) orelse {
                    try self.fail(error.BadOperatorForPrefix, "bad operator {} for a prefix operation.", .{op});
                    unreachable;
                };
                self.advanceToken();
                const rhs = (try self.parseExpression(right_binding_power)) orelse {
                    try self.fail(error.NoExpressionAfterUnaryOp, "parsed an {} as prefix op and expected an expression after but found none.", .{op});
                    unreachable;
                };
                break :blk ast.Expression{ .unaryOp = .{ .op = op, .right = @constCast(rhs) } };
            },
            .eof => {
                return null;
            },
        };
        while (self.peekToken()) |peek| {
            const op = switch (peek) {
                .operator => |op| op,
                // A close paren ends this sub-expression; the caller that saw the '(' consumes it.
                .syntax => break,
                .eof => break,
                else => {
                    try self.fail(error.UnhandledTokenType, "bad token {any} for an op", .{peek});
                    unreachable;
                },
            };
            const binding = infixBindingPower(op); // todo should binding power just be a struct??
            const left_binding_power = binding[0];
            const right_binding_power = binding[1];
            if (left_binding_power < min_binding_power) break;
            self.advanceToken(); // step onto the operator...
            self.advanceToken(); // ...and past it, onto the first token of the rhs
            var rhs = try self.parseExpression(right_binding_power) orelse {
                try self.fail(error.DanglingOperator, "just parsed an {}, expecting another expression...", .{op});
                unreachable;
            };
            var binNode = try self.allocator.create(ast.Expression);
            binNode.* = ast.Expression{ .binaryOp = .{ .left = lhs, .right = rhs, .op = op } };
            lhs = binNode;
        }
        return lhs;
    }
    fn fail(self: *Parser, err: anyerror, comptime msg: []const u8, vals: anytype) !void {
        self.fail_msg = try std.fmt.allocPrint(self.allocator, msg, vals);
        return err;
    }
};
fn testParse(input: []const u8) !void {
    var l: lex = .{ .source = input, .allocator = std.testing.allocator };
    std.debug.print("parsing `{s}`.\n", .{input});
    const tokens = try l.lex();
    defer std.testing.allocator.free(tokens);
    var p: Parser = .{ .tokens = tokens };
    p.init(std.testing.allocator);
    defer p.deinit();
    if (p.enterParse()) |tree| {
        std.debug.print("{any}\n", .{tree});
    } else |err| {
        std.debug.print("Unexpected error during parsing.\nType: {}\nMessage: {s}\n", .{ err, p.fail_msg });
        try std.testing.expect(false);
    }
}

test "do things compile" {
    std.debug.print("\n", .{});
    try testParse("-123+456");
    try testParse("-123");
    try testParse("abc");
    try testParse("123 + 456");
    try testParse("123 + 456 - 333 * 88");
    try testParse("1+a+3");
    try testParse("");
    try testParse(" ");
    try testParse("(123-456)"); // used to crash before the missing advance past '(' was added
    //try testParse("1 1 1 "); // errors with UnhandledTokenType; testParse treats any error as a failure, so it stays commented out
    //todo add asserts
}
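
// A hedged sketch of the asserts the TODO above mentions (an addition, not part
// of the original gist): parse "1+2" and check the shape of the resulting tree.
// It assumes the lexer produces [literal, operator, literal] for this input.
test "parse shape of 1+2" {
    var l: lex = .{ .source = "1+2", .allocator = std.testing.allocator };
    const tokens = try l.lex();
    defer std.testing.allocator.free(tokens);
    var p: Parser = .{ .tokens = tokens };
    p.init(std.testing.allocator);
    defer p.deinit();
    const tree = (try p.enterParse()).?;
    try std.testing.expect(tree.* == .binaryOp);
    try std.testing.expectEqual(ast.Operator.ADD, tree.binaryOp.op);
    try std.testing.expect(tree.binaryOp.left.* == .literal);
    try std.testing.expect(tree.binaryOp.right.* == .literal);
}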
ast.zig
const std = @import("std");

pub const Operator = enum {
    ADD,
    SUB,
    MUL,
    DIV,
    EXP,
};

pub const Expression = union(enum) {
    literal: Literal,
    identifier: Identifier,
    binaryOp: BinaryOp,
    unaryOp: UnaryOp,

    pub const Literal = i16;
    pub const Identifier = []const u8;

    pub const BinaryOp = struct {
        left: *const Expression,
        right: *const Expression,
        op: Operator,
    };

    pub const UnaryOp = struct {
        right: *Expression,
        op: Operator,
    };
    pub fn format(
        self: Expression,
        comptime fmt: []const u8,
        options: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = options;
        _ = fmt;
        // todo this whole thing is a bit messy, read more about format strings
        switch (self) {
            .literal => |lit| try writer.print("int<{d}>", .{lit}),
            .identifier => |ident| try writer.print("ident<{s}>", .{ident}),
            .binaryOp => |bop| try writer.print("binaryOp<{}, {}, {}>", .{ bop.left, bop.op, bop.right }),
            .unaryOp => |uop| try writer.print("unaryOp<{}, {}>", .{ uop.op, uop.right }),
        }
    }
};
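
// A small usage sketch added for illustration (not part of the original gist):
// build the tree for `1 + 2` by hand and run it through the custom format method
// above; the exact operator spelling in the output depends on Zig's default enum
// formatting.
test "format a hand-built expression" {
    const one = Expression{ .literal = 1 };
    const two = Expression{ .literal = 2 };
    const sum = Expression{ .binaryOp = .{ .left = &one, .right = &two, .op = .ADD } };
    std.debug.print("\n{}\n", .{sum});
}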
lexer.zig
const std = @import("std");
const ast = @import("ast.zig");
const ascii = std.ascii;

// SyntaxTokens are operators/tokens that the AST doesn't care about.
pub const SyntaxToken = enum { LPAREN, RPAREN };

pub const TokenType = std.meta.Tag(Token);

// if you want line and col, you would start here...
pub const Token = union(enum) {
    literal: i16,
    identifier: []const u8,
    operator: ast.Operator,
    syntax: SyntaxToken,
    // note: lex() doesn't currently append an eof token; the parser's .eof branches
    // are there for when it does, and for now it simply runs out of tokens.
    eof: u1,
};
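
// One hypothetical way to act on the line-and-col note above (a sketch, not used
// anywhere in this gist): keep Token as-is and carry a position alongside it, e.g.
//
//   pub const Spanned = struct { tok: Token, line: usize, col: usize };
//
// lex() would then build and return []Spanned, tracking line/col as it advances.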
source: []const u8,
index: usize = 0,
allocator: std.mem.Allocator,
pub fn lex(self: *@This()) ![]Token {
    var workingSet = std.ArrayList(Token).init(self.allocator);
    defer workingSet.deinit();
    while (self.index < self.source.len) {
        var c: u8 = self.source[self.index];
        self.index += 1; //todo can this be a continue expression?
        switch (c) {
            '0'...'9' => {
                const start: usize = self.index - 1;
                while (self.index < self.source.len) {
                    if (!ascii.isDigit(self.source[self.index])) break;
                    self.index += 1;
                }
                const val = self.source[start..self.index];
                const intVal = try std.fmt.parseInt(i16, val, 10);
                try workingSet.append(.{ .literal = intVal });
            },
            'a'...'z', 'A'...'Z' => {
                const start: usize = self.index - 1;
                while (self.index < self.source.len) {
                    if (!ascii.isAlphabetic(self.source[self.index])) break;
                    self.index += 1;
                }
                const val = self.source[start..self.index];
                try workingSet.append(.{ .identifier = val });
            },
            '+' => {
                try workingSet.append(.{ .operator = .ADD });
            },
            '-' => {
                try workingSet.append(.{ .operator = .SUB });
            },
            '*' => {
                try workingSet.append(.{ .operator = .MUL });
            },
            '/' => {
                try workingSet.append(.{ .operator = .DIV });
            },
            '^' => {
                try workingSet.append(.{ .operator = .EXP });
            },
            '(' => {
                try workingSet.append(.{ .syntax = .LPAREN });
            },
            ')' => {
                try workingSet.append(.{ .syntax = .RPAREN });
            }, // TODO how to make lexer switch on ops better???
            ' ', '\t' => {
                continue;
            },
            else => {
                std.debug.print("lex error, unexpected character {c} in {s}\n", .{ c, self.source });
            },
        }
    }
    return workingSet.toOwnedSlice();
}
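
// One hypothetical answer to the operator-switch TODO inside lex() above (a sketch
// added here, not wired in): collapse the five single-character operator arms into
// one prong that maps the character with a nested switch.
//
//   '+', '-', '*', '/', '^' => try workingSet.append(.{ .operator = switch (c) {
//       '+' => ast.Operator.ADD,
//       '-' => .SUB,
//       '*' => .MUL,
//       '/' => .DIV,
//       '^' => .EXP,
//       else => unreachable,
//   } }),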
fn testLex(input: []const u8) !void {
    var lexer: @This() = .{ .source = input, .allocator = std.testing.allocator };
    var result = try lexer.lex();
    defer std.testing.allocator.free(result);
    std.debug.print("{any}\n", .{result});
}

test "basic lex" {
    std.debug.print("\n", .{});
    try testLex("12343");
    try testLex("1+2+3");
    try testLex("1+2 + 3");
    try testLex("abc + 2 + 3");
    //TODO add asserts
}
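
// A hedged sketch of the asserts the TODO above mentions (an addition, not part
// of the original gist): lexing "1+a" should yield a literal, an operator, and an
// identifier, in that order.
test "lex token kinds" {
    var lexer: @This() = .{ .source = "1+a", .allocator = std.testing.allocator };
    const result = try lexer.lex();
    defer std.testing.allocator.free(result);
    try std.testing.expectEqual(@as(usize, 3), result.len);
    try std.testing.expect(result[0] == .literal);
    try std.testing.expect(result[1] == .operator);
    try std.testing.expect(result[2] == .identifier);
}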