Created
September 28, 2018 20:35
-
-
Save itsbth/a6018a4628a1d0f93df4842c532d7986 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const std = @import("std"); | |
const assert = std.debug.assert; | |
const mem = std.mem; | |
/// A lexical token: what kind it is and which bytes of the source it covers.
/// `start` and `end` are used as slice bounds into the tokenizer's buffer
/// (`buffer[start..end]`), so `end` is exclusive.
pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,

    /// Every kind of token. The order of the members is kept stable because
    /// it determines the underlying tag values.
    pub const Id = enum {
        // Special markers.
        Invalid,
        Eof,

        // Literals, names and reserved words.
        Number,
        Identifier,
        Keyword_Var,

        // Punctuation.
        Semi,
        LParen,
        RParen,
        LBrace,
        RBrace,
        LBracket,
        RBracket,

        // Operators.
        Op_Plus,
        Op_Minus,
        Op_Star,
        Op_Slash,
        Op_Equal,
        Op_EqualEqual,
        Op_Less,
        Op_LessEqual,
        Op_Greater,
        Op_GreaterEqual,
    };

    /// Associates the spelling of a reserved word with its token id.
    pub const Keyword = struct {
        bytes: []const u8,
        id: Id,
    };

    /// Table of all reserved words, searched linearly by `getKeyword`.
    pub const keywords = []Keyword{
        Keyword{ .bytes = "var", .id = Id.Keyword_Var },
    };

    /// Returns the keyword id whose spelling equals `bytes` exactly,
    /// or null when `bytes` is not a reserved word.
    fn getKeyword(bytes: []const u8) ?Id {
        var i: usize = 0;
        while (i < keywords.len) : (i += 1) {
            const entry = keywords[i];
            if (!mem.eql(u8, entry.bytes, bytes)) continue;
            return entry.id;
        }
        return null;
    }
};
/// Hand-written scanner that splits `buffer` into `Token`s, one per call to `next`.
pub const Tokenizer = struct {
    /// Source text being scanned; token `start`/`end` offsets index into it.
    buffer: []const u8,
    /// Offset of the first byte that has not been consumed yet.
    index: usize,

    /// Creates a tokenizer positioned at the beginning of `buffer`.
    pub fn init(buffer: []const u8) Tokenizer {
        return Tokenizer{
            .buffer = buffer,
            .index = 0,
        };
    }

    /// Prints the tag name of `token` followed by the quoted source text it
    /// covers, using `std.debug.warn`.
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
    }

    /// States of the scanner inside `next`.
    const State = enum {
        /// Between tokens: skipping whitespace and deciding what comes next.
        Start,
        /// Inside the integer part of a number.
        Number,
        /// Inside the digits that follow a '.' in a number.
        NumberFraction,
        /// Inside an identifier or keyword.
        Identifier,
        /// Saw one '/': either a division operator or the start of a comment.
        Slash,
        /// Inside a `//` comment, waiting for the end of the line.
        LineComment,
        /// Saw one '=': either `=` or the start of `==`.
        Equals,
        /// Saw '<' or '>': either the bare operator or the start of `<=` / `>=`.
        Compare,
    };

    /// Scans and returns the next token, advancing `index` past it.
    ///
    /// Spaces, tabs, carriage returns, newlines and `//` line comments are
    /// skipped. When the buffer is exhausted the returned token has id `Eof`.
    /// Panics on a byte that cannot start any token.
    pub fn next(self: *Tokenizer) Token {
        var state = State.Start;
        var result = Token{
            .id = Token.Id.Eof,
            .start = self.index,
            .end = undefined,
        };
        // Convention inside the loop: a `break` leaves `self.index` on the
        // first byte that is NOT part of the token, so prongs that consume the
        // current byte bump the index themselves before breaking.
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                State.Start => switch (c) {
                    // Leading whitespace is not part of any token.
                    ' ', '\n', '\t', '\r' => result.start += 1,
                    'a'...'z', 'A'...'Z', '_' => {
                        state = State.Identifier;
                        result.id = Token.Id.Identifier;
                    },
                    '0'...'9' => {
                        state = State.Number;
                        result.id = Token.Id.Number;
                    },
                    '+' => {
                        result.id = Token.Id.Op_Plus;
                        self.index += 1;
                        break;
                    },
                    '-' => {
                        result.id = Token.Id.Op_Minus;
                        self.index += 1;
                        break;
                    },
                    '*' => {
                        result.id = Token.Id.Op_Star;
                        self.index += 1;
                        break;
                    },
                    '/' => {
                        // Tentatively a division operator, so that a '/' at the
                        // very end of the buffer is still reported correctly.
                        result.id = Token.Id.Op_Slash;
                        state = State.Slash;
                    },
                    '=' => {
                        // Tentatively a plain '='; upgraded if another '=' follows.
                        result.id = Token.Id.Op_Equal;
                        state = State.Equals;
                    },
                    ';' => {
                        result.id = Token.Id.Semi;
                        self.index += 1;
                        break;
                    },
                    '(' => {
                        result.id = Token.Id.LParen;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.id = Token.Id.RParen;
                        self.index += 1;
                        break;
                    },
                    '{' => {
                        result.id = Token.Id.LBrace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.id = Token.Id.RBrace;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        result.id = Token.Id.LBracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.id = Token.Id.RBracket;
                        self.index += 1;
                        break;
                    },
                    '<', '>' => {
                        result.id = if (c == '<') Token.Id.Op_Less else Token.Id.Op_Greater;
                        state = State.Compare;
                    },
                    else => std.debug.panic("can't handle {} at {}", c, self.index),
                },
                State.Number => switch (c) {
                    '0'...'9' => {},
                    '.' => state = State.NumberFraction,
                    else => break,
                },
                State.NumberFraction => switch (c) {
                    '0'...'9' => {},
                    else => break,
                },
                State.Identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '0'...'9', '_' => {},
                    // Keyword resolution happens after the loop so that it also
                    // applies when the identifier runs to the end of the buffer.
                    else => break,
                },
                State.Slash => switch (c) {
                    '/' => {
                        // It was a comment after all: drop the tentative operator.
                        result.id = Token.Id.Eof;
                        state = State.LineComment;
                    },
                    else => break,
                },
                State.LineComment => switch (c) {
                    '\n' => {
                        // The next token can start right after the newline. The
                        // loop's continue expression advances past the '\n'; an
                        // additional increment here would skip one source byte.
                        result.start = self.index + 1;
                        state = State.Start;
                    },
                    else => {},
                },
                State.Equals => switch (c) {
                    '=' => {
                        result.id = Token.Id.Op_EqualEqual;
                        self.index += 1;
                        break;
                    },
                    else => break,
                },
                State.Compare => switch (c) {
                    '=' => {
                        result.id = if (result.id == Token.Id.Op_Less)
                            Token.Id.Op_LessEqual
                        else
                            Token.Id.Op_GreaterEqual;
                        self.index += 1;
                        break;
                    },
                    else => break,
                },
            }
        }
        // Promote identifiers that spell a reserved word, regardless of whether
        // the identifier was ended by another byte or by the end of the buffer.
        if (result.id == Token.Id.Identifier) {
            if (Token.getKeyword(self.buffer[result.start..self.index])) |id| {
                result.id = id;
            }
        }
        result.end = self.index;
        return result;
    }
};
// Smoke test: building a tokenizer over a string literal must compile and run.
test "create new tokenizer" {
    const tokenizer = Tokenizer.init(&"text goes here");
}
// An integer and a number with a fractional part both scan as `Number`.
test "tokenize number" {
    const expected = []Token.Id{
        Token.Id.Number,
        Token.Id.Number,
    };
    testTokenize(&"1337 42.5", expected);
}
// A reserved word scans as its keyword id; any other name is an `Identifier`.
test "tokenize identifier" {
    const expected = []Token.Id{
        Token.Id.Keyword_Var,
        Token.Id.Identifier,
    };
    testTokenize(&"var foo", expected);
}
/// Test helper: tokenizes `source` and checks that the token ids come out in
/// exactly the order given by `expected_tokens`, followed by `Eof`.
/// Panics with both tag names on the first mismatch.
fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) void {
    var tokenizer = Tokenizer.init(source);
    var i: usize = 0;
    while (i < expected_tokens.len) : (i += 1) {
        const want = expected_tokens[i];
        const got = tokenizer.next().id;
        if (got == want) continue;
        std.debug.panic("expected {}, found {}\n", @tagName(want), @tagName(got));
    }
    // Nothing but end-of-input may remain after the expected tokens.
    std.debug.assert(tokenizer.next().id == Token.Id.Eof);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment