NOTE: This is not meant to be production-ready for every use case out there; it's only here for the curious people reading my articles. That's why I didn't turn it into a GitHub project: it needs a bit more work, and the test files need untangling, before it would be useful to others.
A zero-allocation YAML parser written in Zig, supporting the subset of YAML I needed for my own project.
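For orientation, here is a minimal sketch of how the two iterators below compose. It assumes lexer.zig and parser.zig sit next to each other, as the imports in the files suggest; the main wrapper and the print loop are illustrative, not part of the gist.

const std = @import("std");
const lexer = @import("lexer.zig");
const parser = @import("parser.zig");

pub fn main() !void {
    const source = "key: true";
    // The parser pulls tokens from the lexer internally; neither allocates.
    var it = parser.YamlParserIterator.init(source, lexer.YamlLexerTokenIterator.init(source));
    while (try it.next()) |token| {
        // Tokens carry only indices into the source, so slicing recovers the text.
        std.debug.print("{s}: \"{s}\"\n", .{
            @tagName(token.content),
            source[token.start_index..token.end_index],
        });
    }
}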
lexer.zig:
const std = @import("std");
const StaticBitStack = @import("utils.zig").StaticBitStack;

pub const YamlLexerTokenType = enum {
    identifier,
    colon,
    integer,
    float,
    string_single_line,
    bool,
    whitespace,
    newline,
    object_start,
    object_end,
    array_start,
    array_end,
    array_item,
    comma,
    comment,
};

pub const YamlLexerTokenIndex = u32;
pub const YamlLexerToken = struct {
    start_index: YamlLexerTokenIndex,
    end_index: YamlLexerTokenIndex,
    content: YamlLexerTokenContent,
};
pub const YamlLexerTokenList = std.ArrayList(YamlLexerToken);
pub const YamlLexerTokenContent = union(YamlLexerTokenType) {
    identifier: void,
    colon: void,
    integer: i64,
    float: f64,
    string_single_line: YamlLexerStringTokenContent,
    bool: bool,
    whitespace: void,
    newline: void,
    object_start: void,
    object_end: void,
    array_start: void,
    array_end: void,
    array_item: void,
    comma: void,
    comment: void,
};
pub const YamlLexerStringTokenContent = struct {
    has_escapes: bool,
};

pub const YamlLexerError = error{
    unexpected_char,
    nesting_depth_exceeded,
    unexpected_nesting_end,
};

const YamlLexerObjectEntryLineStage = enum {
    start,
    newline,
    leading_whitespace,
    array_item,
    last_whitespace,
    identifier,
    colon,
    none,
};

pub const YamlNestingType = enum {
    object,
    array,
};
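// Tracks inline {}/[] nesting as one bit per level on a StaticBitStack
// (object = 1, array = 0); depth is therefore capped at the stack's 63-bit
// capacity. pop() can optionally assert which kind of scope is being closed,
// turning mismatched brackets like "{]" into unexpected_nesting_end.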
pub const YamlNesting = struct {
    bit_stack: StaticBitStack = .{},
    // debug: bool = false,

    pub fn push(self: *YamlNesting, nesting_type: YamlNestingType) !void {
        // if (self.debug) {
        //     std.debug.print("push: {any}\n", .{nesting_type});
        // }
        try self.bit_stack.push(if (nesting_type == .object) 1 else 0);
    }

    pub fn get(self: YamlNesting) ?YamlNestingType {
        const bit = self.bit_stack.get() orelse {
            return null;
        };
        return if (bit == 1) .object else .array;
    }

    pub fn pop(self: *YamlNesting, maybe_expected_nesting_type: ?YamlNestingType) !void {
        // if (self.debug) {
        //     std.debug.print("pop (old): {any}\n", .{self.get()});
        // }
        if (self.get()) |nesting| {
            if (maybe_expected_nesting_type) |expected_nesting_type| {
                if (nesting != expected_nesting_type) {
                    return YamlLexerError.unexpected_nesting_end;
                }
            }
        } else {
            return YamlLexerError.unexpected_nesting_end;
        }
        try self.bit_stack.pop();
        // if (self.debug) {
        //     std.debug.print("pop (new): {any}\n", .{self.get()});
        // }
    }
};

pub const YamlLexerTokenIterator = struct {
    source: []const u8,
    start_index: YamlLexerTokenIndex = 0,
    object_depth: u8 = 0,
    object_entry_line: YamlLexerObjectEntryLineStage = .start,
    last_token: ?YamlLexerToken = null,
    inline_nesting: YamlNesting = .{},

    pub fn init(source: []const u8) YamlLexerTokenIterator {
        return .{
            .source = source,
        };
    }

    pub fn next(self: *YamlLexerTokenIterator) !?YamlLexerToken {
        if (self.start_index >= self.source.len) {
            return null;
        }
        var token: YamlLexerToken = undefined;
        const at_start_index: u8 = self.source[self.start_index];
        var is_entry_value = false;
        if (self.object_entry_line != .none) {
            if (self.last_token) |last_token| {
                if (self.object_entry_line == .newline and last_token.content == .newline) {
                    self.object_entry_line = .start;
                } else if (self.object_entry_line == .start and last_token.content == .identifier) {
                    self.object_entry_line = .identifier;
                } else if (self.object_entry_line == .start and last_token.content == .whitespace) {
                    self.object_entry_line = .leading_whitespace;
                } else if (self.object_entry_line == .leading_whitespace and last_token.content == .identifier) {
                    self.object_entry_line = .identifier;
                } else if (self.object_entry_line == .leading_whitespace and last_token.content == .array_item) {
                    self.object_entry_line = .array_item;
                } else if (self.object_entry_line == .array_item and last_token.content == .whitespace) {
                    self.object_entry_line = .last_whitespace;
                } else if (self.object_entry_line == .last_whitespace and last_token.content == .identifier) {
                    self.object_entry_line = .identifier;
                } else if (self.object_entry_line == .identifier and last_token.content == .colon) {
                    self.object_entry_line = .colon;
                } else if (self.object_entry_line == .colon and last_token.content == .whitespace) {
                    is_entry_value = true;
                    self.object_entry_line = .none;
                } else {
                    self.object_entry_line = .none;
                }
            }
        }
        if (at_start_index == ':') {
            token = self.simpleToken(1, .{ .colon = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == ',') {
            token = self.simpleToken(1, .{ .comma = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == '-') {
            token = self.simpleToken(1, .{ .array_item = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == '{') {
            try self.inline_nesting.push(.object);
            token = self.simpleToken(1, .{ .object_start = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == '}') {
            try self.inline_nesting.pop(.object);
            token = self.simpleToken(1, .{ .object_end = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == '[') {
            try self.inline_nesting.push(.array);
            token = self.simpleToken(1, .{ .array_start = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == ']') {
            try self.inline_nesting.pop(.array);
            token = self.simpleToken(1, .{ .array_end = {} });
            self.start_index = token.end_index;
        } else if (at_start_index == ' ' or at_start_index == '\t') {
            var end_index = self.start_index + 1;
            while (end_index < self.source.len) : (end_index += 1) {
                const at_end_index = self.source[end_index];
                if (at_end_index != ' ' and at_end_index != '\t') {
                    break;
                }
            }
            token = YamlLexerToken{
                .start_index = self.start_index,
                .end_index = end_index,
                .content = .{ .whitespace = {} },
            };
            self.start_index = token.end_index;
        } else if (at_start_index == '\n' or at_start_index == '\r') {
            const is_windows = at_start_index == '\r' and self.start_index + 1 < self.source.len and self.source[self.start_index + 1] == '\n';
            const offset: u8 = if (is_windows) 2 else 1;
            token = self.simpleToken(offset, .{ .newline = {} });
            self.object_entry_line = .newline;
            self.start_index = token.end_index;
        } else if (at_start_index == '#') {
            var end_index = self.start_index + 1;
            while (end_index < self.source.len) : (end_index += 1) {
                const at_end_index = self.source[end_index];
                if (at_end_index == '\r' or at_end_index == '\n') {
                    break;
                }
            }
            token = YamlLexerToken{
                .start_index = self.start_index,
                .end_index = end_index,
                .content = .{ .comment = {} },
            };
            self.start_index = token.end_index;
        } else {
            token = try self.parseValue(is_entry_value);
        }
        self.last_token = token;
        return token;
    }

    fn parseValue(self: *YamlLexerTokenIterator, is_entry_value: bool) !YamlLexerToken {
        const at_start_index: u8 = self.source[self.start_index];
        if (at_start_index == '\'' or at_start_index == '"') {
            const token = self.parseQuotedString(at_start_index);
            self.start_index = token.end_index + 1;
            return token;
        } else if (at_start_index >= '0' and at_start_index <= '9') {
            const token = try self.parseNumber();
            self.start_index = token.end_index;
            return token;
        }
        var end_index = self.start_index + 1;
        const maybe_nesting = self.inline_nesting.get();
        var maybe_whitespace_start: ?YamlLexerTokenIndex = null;
        while (end_index < self.source.len) : (end_index += 1) {
            const at_end_index = self.source[end_index];
            if (at_end_index == '\n' or at_end_index == '\r') {
                break;
            }
            if (!is_entry_value and (at_end_index == ',' or at_end_index == ':')) {
                break;
            }
            if (maybe_nesting) |nesting| {
                if (at_end_index == ' ' and maybe_whitespace_start == null) {
                    maybe_whitespace_start = end_index;
                }
                if ((nesting == .array and at_end_index == ']') or (nesting == .object and at_end_index == '}')) {
                    if (maybe_whitespace_start) |whitespace_start| {
                        end_index = whitespace_start;
                    }
                    break;
                }
                if (at_end_index != ' ') {
                    maybe_whitespace_start = null;
                }
            }
        }
        var token = YamlLexerToken{ .start_index = self.start_index, .end_index = end_index, .content = .{
            .string_single_line = .{ .has_escapes = false },
        } };
        const is_key = token.end_index < self.source.len and self.source[token.end_index] == ':';
        if (!is_key) {
            const sliced = self.source[token.start_index..token.end_index];
            if (std.mem.eql(u8, sliced, "true")) {
                token.content = .{ .bool = true };
            } else if (std.mem.eql(u8, sliced, "false")) {
                token.content = .{ .bool = false };
            }
        } else {
            token.content = .{ .identifier = {} };
        }
        self.start_index = token.end_index;
        return token;
    }

    fn parseNumber(self: *YamlLexerTokenIterator) !YamlLexerToken {
        var end_index = self.start_index + 1;
        var dot_found = false;
        while (end_index < self.source.len) : (end_index += 1) {
            const at_end_index = self.source[end_index];
            if (at_end_index == '.') {
                if (dot_found) {
                    self.start_index = end_index;
                    return YamlLexerError.unexpected_char;
                } else {
                    dot_found = true;
                }
            } else if (at_end_index < '0' or at_end_index > '9') {
                break;
            }
        }
        const sliced = self.source[self.start_index..end_index];
        return YamlLexerToken{
            .start_index = self.start_index,
            .end_index = end_index,
            .content = if (dot_found) .{
                .float = try std.fmt.parseFloat(f64, sliced),
            } else .{
                .integer = try std.fmt.parseInt(i64, sliced, 10),
            },
        };
    }

    fn parseQuotedString(self: YamlLexerTokenIterator, quote: u8) YamlLexerToken {
        var end_index = self.start_index + 1;
        var escaped = false;
        var has_escapes = false;
        while (end_index < self.source.len) : (end_index += 1) {
            const at_end_index = self.source[end_index];
            if (at_end_index == '\\') {
                escaped = !escaped;
                has_escapes = true;
            } else if (at_end_index == quote and !escaped) {
                break;
            } else {
                escaped = false;
            }
        }
        return YamlLexerToken{
            .start_index = self.start_index + 1,
            .end_index = end_index,
            .content = .{ .string_single_line = .{ .has_escapes = has_escapes } },
        };
    }

    fn simpleToken(self: YamlLexerTokenIterator, length: YamlLexerTokenIndex, content: YamlLexerTokenContent) YamlLexerToken {
        return YamlLexerToken{
            .start_index = self.start_index,
            .end_index = self.start_index + length,
            .content = content,
        };
    }
};

fn isValidIdentifierChar(char: u8, first: bool) bool {
    return (!first and char >= '0' and char <= '9') or (char >= 'a' and char <= 'z') or (char >= 'A' and char <= 'Z') or char == '_' or char == '.';
}
parser.zig:
const std = @import("std");
const lexer = @import("lexer.zig");
const StaticBitStack = @import("utils.zig").StaticBitStack;

const most_significant_bit: u64 = 0b1000000000000000000000000000000000000000000000000000000000000000;

pub const YamlParserToken = struct {
    start_index: YamlParserTokenIndex,
    end_index: YamlParserTokenIndex,
    content: YamlParserTokenContent,
};
pub const YamlParserTokenList = std.ArrayList(YamlParserToken);
pub const YamlParserTokenType = enum {
    bool,
    float,
    integer,
    string_single_line,
    object_entry_start,
    object_entry_end,
    array_item_start,
    array_item_end,
};
pub const YamlParserTokenContent = union(YamlParserTokenType) {
    bool: bool,
    float: f64,
    integer: i64,
    string_single_line: YamlParserStringTokenContent,
    object_entry_start: void,
    object_entry_end: void,
    array_item_start: void,
    array_item_end: void,
};
pub const YamlParserStringTokenContent = struct {
    has_escapes: bool,
};
pub const YamlParserTokenIndex = u32;

pub const YamlParserError = error{
    unexpected_token,
    unexpected_end,
    invalid_indent,
    invalid_outdent,
    invalid_indentation,
};

const ParseNewLineOutput = struct {
    next_input_token: ?lexer.YamlLexerToken = null,
    output_token: ?YamlParserToken = null,
};

pub const YamlParserIterator = struct {
    source: []const u8,
    lexer_iterator: lexer.YamlLexerTokenIterator,
    is_new_line: bool = true,
    last_input_token: ?lexer.YamlLexerToken = null,
    next_input_token: ?lexer.YamlLexerToken = null,
    block_nesting: lexer.YamlNesting = .{},
    // for each character, from the least significant bit: 0 for spaces, 1 for tabs
    leading_whitespace_chars: u64 = 0,
    leading_whitespace_size: u6 = 0,
    // 1 for each index that started an indentation level
    leading_whitespace_levels: u64 = 0,
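    // Worked example (mirrors the "parser: nesting detection" test below):
    // after detecting the line prefixes " \t" and then " \t  \t",
    //   leading_whitespace_chars = 0b10010 (LSB-first: ' ', '\t', ' ', ' ', '\t')
    //   leading_whitespace_size  = 5
    // and leading_whitespace_levels holds level boundaries counted from the
    // MSB: each indent of d extra characters shifts the word right by d and
    // sets bit (d - 1), giving 0b00101 followed by 59 zero bits here.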
    indentation_level: u6 = 0,
    outdents_left: u6 = 0,
    last_newline_index: YamlParserTokenIndex = 0,
    last_newline_length: u2 = 0,
    array_item_expected: bool = false,
    array_item_starts_left: u8 = 0,
    next_output_token: ?YamlParserToken = null,
    lexer_time: u64 = 0,

    pub fn init(source: []const u8, lexer_iterator: lexer.YamlLexerTokenIterator) YamlParserIterator {
        return .{
            .source = source,
            .lexer_iterator = lexer_iterator,
        };
    }

    pub fn next(self: *YamlParserIterator) !?YamlParserToken {
        const was_file_start = self.last_input_token == null;
        if (self.outdents_left > 0) {
            self.outdents_left -= 1;
            const block_type = self.block_nesting.get() orelse {
                return YamlParserError.invalid_outdent;
            };
            try self.block_nesting.pop(null);
            return .{
                .start_index = self.last_newline_index,
                .end_index = self.last_newline_index + self.last_newline_length,
                .content = switch (block_type) {
                    .object => .{ .object_entry_end = {} },
                    .array => .{ .array_item_end = {} },
                },
            };
        }
        if (self.array_item_starts_left > 0) {
            self.array_item_starts_left -= 1;
            const start_index = self.next_output_token.?.start_index;
            return .{ .start_index = start_index, .end_index = start_index, .content = .{ .array_item_start = {} } };
        }
        if (self.next_output_token) |token| {
            self.next_output_token = null;
            return token;
        }
        if (self.array_item_expected) {
            self.array_item_expected = false;
            self.array_item_starts_left += 1;
        }
        var input_token: lexer.YamlLexerToken = undefined;
        if (self.next_input_token) |next_input_token| {
            input_token = next_input_token;
            self.next_input_token = null;
            // std.debug.print("IN token (sn): {any}\n", .{input_token});
        } else {
            input_token = try self.nextInputToken() orelse {
                return self.getFinalToken();
            };
            // std.debug.print("IN token (s): {any}\n", .{input_token});
        }
        if (was_file_start) {
            while (input_token.content == .comment or input_token.content == .newline) {
                input_token = try self.nextInputToken() orelse {
                    return self.getFinalToken();
                };
            }
        }
        var has_array_item = false;
        var array_item_index: YamlParserTokenIndex = 0;
        var has_array_start = false;
        var has_object_start = false;
        var empty_object = false;
        while (true) {
            const was_object_start = input_token.content == .object_start;
            if (input_token.content == .newline) {
                has_array_item = false;
                array_item_index = 0;
                has_object_start = false;
                has_array_start = false;
                const result = try self.parseNewLine(input_token);
                if (result.next_input_token) |next_input_token| {
                    input_token = next_input_token;
                }
                if (result.output_token) |output_token| {
                    return output_token;
                }
            } else if (input_token.content == .whitespace) {
                input_token = try self.nextInputToken() orelse {
                    if (self.block_nesting.bit_stack.size > 0) {
                        return YamlParserError.unexpected_end;
                    } else {
                        return self.getFinalToken();
                    }
                };
                // std.debug.print("IN token (w): {any}\n", .{input_token});
            } else if (input_token.content == .object_start) {
                has_object_start = true;
                empty_object = true;
                // skip these, the lexer already keeps track of the relevant data
                input_token = try self.nextInputToken() orelse {
                    return self.getFinalToken();
                };
                // std.debug.print("IN token (os): {any}\n", .{input_token});
            } else if (input_token.content == .object_end) {
                if (empty_object) {
                    empty_object = false;
                    input_token = try self.nextInputToken() orelse {
                        return self.getFinalToken();
                    };
                } else {
                    break;
                }
                // std.debug.print("IN token (oe): {any}\n", .{input_token});
            } else if (input_token.content == .array_start) {
                has_array_start = true;
                input_token = try self.nextInputToken() orelse {
                    return self.getFinalToken();
                };
                self.array_item_starts_left += 1;
                // std.debug.print("IN token (as): {any}\n", .{input_token});
            } else if (input_token.content == .array_item) {
                if (has_array_item or has_array_start or has_object_start) {
                    return YamlParserError.unexpected_token;
                }
                has_array_item = true;
                array_item_index = input_token.start_index;
                input_token = try self.nextInputToken() orelse {
                    return self.getFinalToken();
                };
                // std.debug.print("IN token (ai): {any}\n", .{input_token});
            } else {
                break;
            }
            if (!was_object_start) {
                empty_object = false;
            }
        }
        var output_token: ?YamlParserToken = null;
        const was_new_line = self.is_new_line;
        self.is_new_line = false;
        if (was_new_line) {
            if (input_token.content == .identifier) {
                if (has_array_item) {
                    try self.block_nesting.push(.array);
                    if (!has_object_start) {
                        try self.appendLeadingWhitespace("  ");
                    }
                }
                if (!has_object_start) {
                    try self.block_nesting.push(.object);
                }
            } else if (has_array_item) {
                try self.block_nesting.push(.array);
            }
        }
        if (input_token.content == .identifier) {
            if (!has_array_item) {
                _ = try self.expectNextToken(.colon);
            }
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .object_entry_start = {} },
            };
        } else if (input_token.content == .string_single_line) {
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .string_single_line = .{ .has_escapes = input_token.content.string_single_line.has_escapes } },
            };
        } else if (input_token.content == .integer) {
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .integer = input_token.content.integer },
            };
        } else if (input_token.content == .float) {
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .float = input_token.content.float },
            };
        } else if (input_token.content == .bool) {
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .bool = input_token.content.bool },
            };
        } else if (input_token.content == .comma) {
            const lexer_nesting = self.lexer_iterator.inline_nesting.get() orelse {
                return YamlParserError.unexpected_token;
            };
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = switch (lexer_nesting) {
                    .object => .{ .object_entry_end = {} },
                    .array => .{ .array_item_end = {} },
                },
            };
            if (lexer_nesting == .array) {
                self.array_item_expected = true;
            }
        } else if (input_token.content == .object_end) {
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .object_entry_end = {} },
            };
        } else if (input_token.content == .array_end) {
            output_token = .{
                .start_index = input_token.start_index,
                .end_index = input_token.end_index,
                .content = .{ .array_item_end = {} },
            };
        }
        if (output_token == null) {
            return YamlParserError.unexpected_token;
        }
        if (has_array_item) {
            self.next_input_token = input_token;
            return .{ .start_index = output_token.?.start_index, .end_index = output_token.?.start_index, .content = .{ .array_item_start = {} } };
        }
        if (self.array_item_starts_left > 0) {
            self.next_output_token = output_token;
            self.array_item_starts_left -= 1;
            return .{ .start_index = output_token.?.start_index, .end_index = output_token.?.start_index, .content = .{ .array_item_start = {} } };
        }
        return output_token;
    }

    fn parseNewLine(self: *YamlParserIterator, orig_input_token: lexer.YamlLexerToken) !ParseNewLineOutput {
        var input_token = orig_input_token;
        var maybe_next_input_token: ?lexer.YamlLexerToken = input_token;
        var maybe_leading_whitespace_token: ?lexer.YamlLexerToken = null;
        self.is_new_line = true;
        var is_last_line = false;
        while (maybe_next_input_token) |next_input_token| {
            if (next_input_token.content == .whitespace) {
                maybe_leading_whitespace_token = next_input_token;
            } else if (next_input_token.content == .newline) {
                maybe_leading_whitespace_token = null;
                self.last_newline_index = input_token.start_index;
                self.last_newline_length = @intCast(input_token.end_index - input_token.start_index);
            } else if (next_input_token.content != .comment) {
                break;
            }
            maybe_next_input_token = try self.nextInputToken();
            // std.debug.print("IN token (n): {any}\n", .{maybe_next_input_token});
            if (maybe_next_input_token == null) {
                is_last_line = true;
                break;
            }
        }
        const prev_indent_level = self.indentation_level;
        if (maybe_leading_whitespace_token) |leading_whitespace_token| {
            if (leading_whitespace_token.content == .whitespace) {
                try self.detectLeadingWhitespace(self.source[leading_whitespace_token.start_index..leading_whitespace_token.end_index]);
            } else {
                try self.detectLeadingWhitespace("");
            }
        } else {
            try self.detectLeadingWhitespace("");
        }
        // std.debug.print("{d} -> {d}\n", .{ prev_indent_level, self.indentation_level });
        if (self.indentation_level <= prev_indent_level) {
            if (self.indentation_level < prev_indent_level) {
                self.outdents_left = prev_indent_level - self.indentation_level;
            }
            if (!is_last_line) {
                if (maybe_next_input_token) |next_input_token| {
                    self.next_input_token = next_input_token;
                }
            }
            const block_type = self.block_nesting.get() orelse {
                return YamlParserError.invalid_outdent;
            };
            try self.block_nesting.pop(null);
            return .{ .output_token = .{
                .start_index = self.last_newline_index,
                .end_index = self.last_newline_index + self.last_newline_length,
                .content = switch (block_type) {
                    .object => .{ .object_entry_end = {} },
                    .array => .{ .array_item_end = {} },
                },
            } };
        } else if (maybe_next_input_token) |next_input_token| {
            input_token = next_input_token;
        } else {
            return .{ .output_token = try self.getFinalToken() };
        }
        return .{
            .next_input_token = input_token,
        };
    }

    fn nextInputToken(self: *YamlParserIterator) !?lexer.YamlLexerToken {
        var timer = try std.time.Timer.start();
        const input_token = try self.lexer_iterator.next();
        self.lexer_time += timer.read();
        if (input_token != null) {
            self.last_input_token = input_token;
        }
        return input_token;
    }

    fn getFinalToken(self: *YamlParserIterator) !?YamlParserToken {
        if (self.block_nesting.get()) |nesting| {
            if (self.last_input_token) |last_input_token| {
                try self.block_nesting.pop(null);
                return .{
                    .start_index = last_input_token.end_index,
                    .end_index = last_input_token.end_index,
                    .content = switch (nesting) {
                        .object => .object_entry_end,
                        .array => .array_item_end,
                    },
                };
            }
        }
        return null;
    }

    pub fn expectNextToken(self: *YamlParserIterator, maybe_expected: ?lexer.YamlLexerTokenType) !lexer.YamlLexerToken {
        const token = try self.nextInputToken() orelse {
            return YamlParserError.unexpected_end;
        };
        if (maybe_expected) |expected| {
            // std.debug.print("expect next ({any}): {any}\n", .{ expected, token });
            if (token.content != expected) {
                return YamlParserError.unexpected_token;
            }
        }
        return token;
    }

    pub fn appendLeadingWhitespace(self: *YamlParserIterator, str: []const u8) !void {
        var next_whitespace_chars: u64 = self.leading_whitespace_chars;
        for (str, 0..) |char, index| {
            std.debug.assert(char == ' ' or char == '\t');
            const bit: u64 = @intFromBool(char == '\t');
            const shift = @as(u6, @intCast(index)) + self.leading_whitespace_size;
            next_whitespace_chars = next_whitespace_chars | (bit << shift);
        }
        const indentation_diff: u6 = @intCast(str.len);
        self.leading_whitespace_levels = self.leading_whitespace_levels >> indentation_diff;
        self.leading_whitespace_levels = self.leading_whitespace_levels | (most_significant_bit >> (indentation_diff - 1));
        self.leading_whitespace_chars = next_whitespace_chars;
        self.leading_whitespace_size += indentation_diff;
        self.indentation_level += 1;
    }

    pub fn detectLeadingWhitespace(self: *YamlParserIterator, next_whitespace_string: []const u8) !void {
        std.debug.assert(next_whitespace_string.len < 64);
        const next_whitespace_size: u6 = @intCast(next_whitespace_string.len);
        var next_whitespace_chars: u64 = 0;
        for (next_whitespace_string, 0..) |char, index| {
            std.debug.assert(char == ' ' or char == '\t');
            const bit: u64 = @intFromBool(char == '\t');
            next_whitespace_chars = next_whitespace_chars | (bit << @as(u6, @intCast(index)));
        }
        if (next_whitespace_size > self.leading_whitespace_size) {
            const common_mask = leastSignificantBits(self.leading_whitespace_size);
            if (self.leading_whitespace_chars & common_mask != next_whitespace_chars & common_mask) {
                return YamlParserError.invalid_indent;
            }
            self.indentation_level += 1;
            const indentation_diff: u6 = next_whitespace_size - self.leading_whitespace_size;
            self.leading_whitespace_levels = self.leading_whitespace_levels >> indentation_diff;
            self.leading_whitespace_levels = self.leading_whitespace_levels | (most_significant_bit >> (indentation_diff - 1));
        } else if (next_whitespace_size < self.leading_whitespace_size) {
            const common_mask = leastSignificantBits(next_whitespace_size);
            if (self.leading_whitespace_chars & common_mask != next_whitespace_chars & common_mask) {
                return YamlParserError.invalid_outdent;
            }
            var indentation_diff: u6 = self.leading_whitespace_size - next_whitespace_size;
            var next_indentation_level = self.indentation_level;
            var next_levels = self.leading_whitespace_levels;
            while (true) {
                const expected_outdent: u6 = @intCast(@clz(next_levels) + 1);
                next_levels = next_levels << @intCast(@clz(next_levels) + 1);
                if (indentation_diff >= expected_outdent) {
                    next_indentation_level -= 1;
                    indentation_diff -= expected_outdent;
                    if (indentation_diff == 0) {
                        break;
                    } else {
                        if (next_levels == 0) {
                            return YamlParserError.invalid_outdent;
                        }
                    }
                } else {
                    return YamlParserError.invalid_outdent;
                }
            }
            self.leading_whitespace_levels = next_levels;
            self.indentation_level = next_indentation_level;
        } else {
            const common_mask = leastSignificantBits(self.leading_whitespace_size);
            if (self.leading_whitespace_chars & common_mask != next_whitespace_chars & common_mask) {
                return YamlParserError.invalid_indentation;
            }
        }
        // self.leading_whitespace_levels |= level_bit;
        self.leading_whitespace_size = next_whitespace_size;
        self.leading_whitespace_chars = next_whitespace_chars;
    }
};

fn leastSignificantBits(n: u6) u64 {
    if (n >= 64) return 0xFFFFFFFFFFFFFFFF;
    return (@as(u64, 1) << n) - 1;
}
utils.zig:
const std = @import("std");

pub const StaticBitStackError = error{
    exceeded_capacity,
    popped_empty,
};

pub const StaticBitStack = struct {
    stack: u64 = 0,
    size: u6 = 0,
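    // The top of the stack lives in the least-significant bit of `stack`;
    // push() shifts left and sets the new bit, pop() shifts right. Since
    // `size` is a u6, at most 63 bits fit before push() reports
    // exceeded_capacity.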
    pub fn push(self: *StaticBitStack, value: u1) !void {
        if (self.size == 63) {
            return StaticBitStackError.exceeded_capacity;
        }
        self.size += 1;
        self.stack = (self.stack << 1) | value;
    }

    pub fn get(self: StaticBitStack) ?u1 {
        if (self.size > 0) {
            return @intCast(self.stack & 1);
        } else {
            return null;
        }
    }

    pub fn pop(self: *StaticBitStack) !void {
        if (self.size == 0) {
            return StaticBitStackError.popped_empty;
        }
        self.size -= 1;
        self.stack = self.stack >> 1;
    }
};

test "BitStack" {
    var bit_stack = StaticBitStack{};
    try bit_stack.push(1);
    try bit_stack.push(0);
    try bit_stack.push(0);
    try bit_stack.push(1);
    try std.testing.expectEqual(1, bit_stack.get());
    try bit_stack.pop();
    try std.testing.expectEqual(0, bit_stack.get());
    try bit_stack.pop();
    try std.testing.expectEqual(0, bit_stack.get());
    try bit_stack.pop();
    try std.testing.expectEqual(1, bit_stack.get());
    try bit_stack.pop();
    try std.testing.expectEqual(null, bit_stack.get());
    try bit_stack.push(1);
    try std.testing.expectEqual(1, bit_stack.get());
    try bit_stack.pop();
    try std.testing.expectEqual(null, bit_stack.get());
    try bit_stack.push(0);
    try std.testing.expectEqual(0, bit_stack.get());
    try bit_stack.pop();
    try std.testing.expectEqual(null, bit_stack.get());
}
Tests for the lexer:
const std = @import("std");
const lexer = @import("yaml/lexer.zig");
const testing = @import("utils/testing.zig");
const pretty = @import("external/pretty.zig");

const ShortExpectedLexerToken = struct {
    length: lexer.YamlLexerTokenIndex,
    content: lexer.YamlLexerTokenContent,
    start_offset: lexer.YamlLexerTokenIndex = 0,
    end_offset: lexer.YamlLexerTokenIndex = 0,
};

fn testLexing(short_expected: []const ShortExpectedLexerToken, source: []const u8) !void {
    var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
    const allocator = arena.allocator();
    defer arena.deinit();
    var expected = lexer.YamlLexerTokenList.init(allocator);
    var start_index: lexer.YamlLexerTokenIndex = 0;
    for (short_expected) |short| {
        start_index += short.start_offset;
        try expected.append(.{
            .start_index = start_index,
            .end_index = start_index + short.length,
            .content = short.content,
        });
        start_index += short.length;
        start_index += short.end_offset;
    }
    var tokenizer = lexer.YamlLexerTokenIterator.init(source);
    var tokens = lexer.YamlLexerTokenList.init(allocator);
    var parsing_time: u64 = 0;
    var timer = try std.time.Timer.start();
    while (try tokenizer.next()) |token| {
        parsing_time += timer.read();
        try tokens.append(token);
        timer.reset();
    }
    // std.debug.print("parsing time: {d}ns\n", .{parsing_time});
    std.testing.expectEqualDeep(expected.items, tokens.items) catch |err| {
        var slices = std.ArrayList([]const u8).init(allocator);
        for (tokens.items) |token| {
            try slices.append(source[token.start_index..token.end_index]);
        }
        std.debug.print("\nactual slices:\n\n", .{});
        try pretty.print(allocator, slices.items, .{});
        std.debug.print("\nactual:\n\n", .{});
        try pretty.print(allocator, tokens.items, .{});
        std.debug.print("\nexpected:\n\n", .{});
        try pretty.print(allocator, expected.items, .{});
        std.debug.print("\n", .{});
        return err;
    };
}

test "lexer: top-level true" {
    try testLexing(&.{
        .{ .length = 4, .content = .{ .bool = true } },
    }, "true");
}

test "lexer: top-level false" {
    try testLexing(&.{
        .{ .length = 5, .content = .{ .bool = false } },
    }, "false");
}

test "lexer: top-level one-entry object" {
    try testLexing(&.{
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 5, .content = .{ .bool = false } },
    }, "key: false");
}

test "lexer: top-level two-entry object" {
    try testLexing(&.{
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 5, .content = .{ .bool = false } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 4, .content = .{ .bool = true } },
    }, "key: false\nkey2: true");
}

test "lexer: nested objects" {
    const source =
        \\parent:
        \\  child: true
        \\key: false
    ;
    try testLexing(&.{
        .{ .length = 6, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 2, .content = .{ .whitespace = {} } },
        .{ .length = 5, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 4, .content = .{ .bool = true } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 5, .content = .{ .bool = false } },
    }, source);
}

test "lexer: single-line strings" {
    const source =
        \\key1: "test key1 value"
        \\key2: 'test key2 value'
        \\key3: "test\"key3 value"
        \\key4: 'test\'key4 value'
        \\key5: test key5 value
    ;
    try testLexing(&.{
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 15, .start_offset = 1, .end_offset = 1, .content = .{ .string_single_line = .{ .has_escapes = false } } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 15, .start_offset = 1, .end_offset = 1, .content = .{ .string_single_line = .{ .has_escapes = false } } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 16, .start_offset = 1, .end_offset = 1, .content = .{ .string_single_line = .{ .has_escapes = true } } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 16, .start_offset = 1, .end_offset = 1, .content = .{ .string_single_line = .{ .has_escapes = true } } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 15, .content = .{ .string_single_line = .{ .has_escapes = false } } },
    }, source);
}

test "lexer: numbers" {
    const source =
        \\int: 221
        \\flt: 23.12
    ;
    try testLexing(&.{
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 3, .content = .{ .integer = 221 } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 5, .content = .{ .float = 23.12 } },
    }, source);
}

test "lexer: inline object" {
    const source =
        \\obj: { key1: true, key2: value2 space }
    ;
    try testLexing(&.{
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .object_start = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 4, .content = .{ .bool = true } },
        .{ .length = 1, .content = .{ .comma = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 4, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 12, .content = .{ .string_single_line = .{ .has_escapes = false } } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .object_end = {} } },
    }, source);
}

test "lexer: inline array" {
    const source =
        \\obj: [ value1, value2 ]
    ;
    try testLexing(&.{
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .array_start = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 6, .content = .{ .string_single_line = .{ .has_escapes = false } } },
        .{ .length = 1, .content = .{ .comma = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 6, .content = .{ .string_single_line = .{ .has_escapes = false } } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .array_end = {} } },
    }, source);
}

test "lexer: list array" {
    const source =
        \\obj:
        \\  - value1
        \\  - true
        \\  - key: 12
        \\  - key: test test
    ;
    try testLexing(&.{
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 2, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .array_item = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 6, .content = .{ .string_single_line = .{ .has_escapes = false } } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 2, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .array_item = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 4, .content = .{ .bool = true } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 2, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .array_item = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 2, .content = .{ .integer = 12 } },
        .{ .length = 1, .content = .{ .newline = {} } },
        .{ .length = 2, .content = .{ .whitespace = {} } },
        .{ .length = 1, .content = .{ .array_item = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 3, .content = .{ .identifier = {} } },
        .{ .length = 1, .content = .{ .colon = {} } },
        .{ .length = 1, .content = .{ .whitespace = {} } },
        .{ .length = 9, .content = .{ .string_single_line = .{ .has_escapes = false } } },
    }, source);
}
Tests for the parser:
const std = @import("std"); | |
const lexer = @import("yaml/lexer.zig"); | |
const parser = @import("yaml/parser.zig"); | |
const testing = @import("utils/testing.zig"); | |
const pretty = @import("external/pretty.zig"); | |
const ShortExpectedParserToken = struct { | |
length: i16, | |
content: parser.YamlParserTokenContent, | |
start_offset: i8, | |
end_offset: i8 = 0, | |
}; | |
fn testParsing(short_expected: []const ShortExpectedParserToken, source: []const u8) !void { | |
var arena = std.heap.ArenaAllocator.init(std.testing.allocator); | |
const allocator = arena.allocator(); | |
defer arena.deinit(); | |
var expected = parser.YamlParserTokenList.init(allocator); | |
var expected_start_index: i16 = 0; | |
for (short_expected) |short| { | |
expected_start_index += @intCast(short.start_offset); | |
try expected.append(.{ | |
.start_index = @intCast(expected_start_index), | |
.end_index = @intCast(expected_start_index + short.length), | |
.content = short.content, | |
}); | |
expected_start_index += @intCast(short.length); | |
expected_start_index += @intCast(short.end_offset); | |
} | |
var tokenizer = parser.YamlParserIterator.init(source, lexer.YamlLexerTokenInterator.init(source)); | |
var tokens = parser.YamlParserTokenList.init(allocator); | |
var parsing_time: u64 = 0; | |
var timer = try std.time.Timer.start(); | |
while (try tokenizer.next()) |token| { | |
parsing_time += timer.read(); | |
try tokens.append(token); | |
// const text = source[token.start_index..token.end_index]; | |
// std.debug.print("OUT token {s}: \"{s}\"\n\n", .{ | |
// @tagName(token.content), | |
// if (text.len == 1 and text[0] == '\n') "\\n" else text, | |
// }); | |
} | |
// std.debug.print("parsing time: {d}ns\n", .{parsing_time}); | |
std.testing.expectEqualDeep(expected.items, tokens.items) catch |err| { | |
var slices = std.ArrayList([]const u8).init(allocator); | |
for (tokens.items) |token| { | |
const text = source[token.start_index..token.end_index]; | |
if (text.len == 1 and text[0] == '\n') { | |
try slices.append(try std.fmt.allocPrint( | |
allocator, | |
"\\n ({s})", | |
.{@tagName(token.content)}, | |
)); | |
} else { | |
try slices.append(try std.fmt.allocPrint( | |
allocator, | |
"{s} ({s})", | |
.{ text, @tagName(token.content) }, | |
)); | |
} | |
} | |
try pretty.print(allocator, slices.items, .{}); | |
std.debug.print("\nactual:\n\n", .{}); | |
try pretty.print(allocator, tokens.items, .{}); | |
std.debug.print("\nexpected:\n\n", .{}); | |
try pretty.print(allocator, expected.items, .{}); | |
std.debug.print("\n", .{}); | |
var short_actual = std.ArrayList(ShortExpectedParserToken).init(allocator); | |
var actual_start_index: i32 = 0; | |
for (tokens.items) |token| { | |
try short_actual.append(.{ | |
.start_offset = @intCast(@as(i32, @intCast(token.start_index)) - actual_start_index), | |
.length = @intCast(token.end_index - token.start_index), | |
.content = token.content, | |
}); | |
actual_start_index = @intCast(token.end_index); | |
} | |
std.debug.print("\nactual short: {any}\n", .{short_actual.items}); | |
return err; | |
}; | |
} | |
test "parser: nesting detection" { | |
var arena = std.heap.ArenaAllocator.init(std.testing.allocator); | |
const allocator = arena.allocator(); | |
_ = allocator; // autofix | |
defer arena.deinit(); | |
var tokenizer = parser.YamlParserIterator.init("", lexer.YamlLexerTokenInterator.init("")); | |
try tokenizer.detectLeadingWhitespace(""); | |
try std.testing.expectEqual(0, tokenizer.leading_whitespace_chars); | |
try std.testing.expectEqual(0, tokenizer.leading_whitespace_size); | |
try std.testing.expectEqual(0b0, tokenizer.leading_whitespace_levels); | |
try std.testing.expectEqual(0, tokenizer.indentation_level); | |
try tokenizer.detectLeadingWhitespace(" \t"); | |
try std.testing.expectEqual(0b10, tokenizer.leading_whitespace_chars); | |
try std.testing.expectEqual(2, tokenizer.leading_whitespace_size); | |
try std.testing.expectEqual(0b0100000000000000000000000000000000000000000000000000000000000000, tokenizer.leading_whitespace_levels); | |
try std.testing.expectEqual(1, tokenizer.indentation_level); | |
try tokenizer.detectLeadingWhitespace(" \t \t"); | |
try std.testing.expectEqual(0b10010, tokenizer.leading_whitespace_chars); | |
try std.testing.expectEqual(5, tokenizer.leading_whitespace_size); | |
try std.testing.expectEqual(0b0010100000000000000000000000000000000000000000000000000000000000, tokenizer.leading_whitespace_levels); | |
try std.testing.expectEqual(2, tokenizer.indentation_level); | |
try std.testing.expectError( | |
parser.YamlParserError.invalid_indent, | |
tokenizer.detectLeadingWhitespace(" "), | |
); | |
try std.testing.expectError( | |
parser.YamlParserError.invalid_outdent, | |
tokenizer.detectLeadingWhitespace(" \t "), | |
); | |
try std.testing.expectError( | |
parser.YamlParserError.invalid_indentation, | |
tokenizer.detectLeadingWhitespace(" "), | |
); | |
try tokenizer.detectLeadingWhitespace(" \t"); | |
try std.testing.expectEqual(0b10, tokenizer.leading_whitespace_chars); | |
try std.testing.expectEqual(2, tokenizer.leading_whitespace_size); | |
try std.testing.expectEqual(0b0100000000000000000000000000000000000000000000000000000000000000, tokenizer.leading_whitespace_levels); | |
try std.testing.expectEqual(1, tokenizer.indentation_level); | |
try tokenizer.detectLeadingWhitespace(" \t \t"); | |
try tokenizer.detectLeadingWhitespace(" \t"); | |
// std.debug.print("\nact:{s}\nexp:{s}\n", .{ | |
// try fmt.allocFormatBits(allocator, tokenizer.leading_whitespace_levels), | |
// try fmt.allocFormatBits(allocator, 0b0100000000000000000000000000000000000000000000000000000000000000), | |
// }); | |
try std.testing.expectEqual(0b10, tokenizer.leading_whitespace_chars); | |
try std.testing.expectEqual(2, tokenizer.leading_whitespace_size); | |
try std.testing.expectEqual(0b0100000000000000000000000000000000000000000000000000000000000000, tokenizer.leading_whitespace_levels); | |
try std.testing.expectEqual(1, tokenizer.indentation_level); | |
try tokenizer.appendLeadingWhitespace(" "); | |
try std.testing.expectEqual(0b0010, tokenizer.leading_whitespace_chars); | |
try std.testing.expectEqual(4, tokenizer.leading_whitespace_size); | |
try std.testing.expectEqual(0b0101000000000000000000000000000000000000000000000000000000000000, tokenizer.leading_whitespace_levels); | |
try std.testing.expectEqual(2, tokenizer.indentation_level); | |
} | |
// test "parser: block nesting" { | |
// var nesting = parser.YamlComplexNesting{}; | |
// try nesting.push(.array); | |
// try nesting.push(.object); | |
// try std.testing.expectEqual(.object, nesting.get()); | |
// try std.testing.expectEqual(false, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(.array, nesting.get()); | |
// try std.testing.expectEqual(false, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(null, nesting.get()); | |
// try std.testing.expectEqual(false, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.push(.object); | |
// try std.testing.expectEqual(false, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.push(.object_and_array); | |
// try std.testing.expectEqual(.object, nesting.get()); | |
// try std.testing.expectEqual(true, nesting.pending_array_get); | |
// try std.testing.expectEqual(true, nesting.pending_object_pop); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(.array, nesting.get()); | |
// try std.testing.expectEqual(true, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(.object, nesting.get()); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(null, nesting.get()); | |
// try std.testing.expectEqual(false, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.push(.object_and_array); | |
// try std.testing.expectEqual(.object, nesting.get()); | |
// try std.testing.expectEqual(true, nesting.pending_array_get); | |
// try std.testing.expectEqual(true, nesting.pending_object_pop); | |
// try nesting.push(.object); | |
// try std.testing.expectEqual(.object, nesting.get()); | |
// try std.testing.expectEqual(false, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(.object, nesting.get()); | |
// try std.testing.expectEqual(true, nesting.pending_array_get); | |
// try std.testing.expectEqual(true, nesting.pending_object_pop); | |
// try nesting.pop(); | |
// try std.testing.expectEqual(.array, nesting.get()); | |
// try std.testing.expectEqual(true, nesting.pending_array_get); | |
// try std.testing.expectEqual(false, nesting.pending_object_pop); | |
// } | |
test "parser: top-level true" { | |
try testParsing(&.{ | |
.{ .length = 4, .start_offset = 0, .content = .{ .bool = true } }, | |
}, "true"); | |
} | |
test "parser: top-level false" { | |
try testParsing(&.{ | |
.{ .length = 5, .start_offset = 0, .content = .{ .bool = false } }, | |
}, "false"); | |
} | |
test "parser: top-level one-entry object" { | |
try testParsing(&.{ | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 5, .start_offset = 2, .content = .{ .bool = false } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, "key: false"); | |
} | |
test "parser: top-level two-entry object" { | |
try testParsing(&.{ | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 5, .start_offset = 2, .content = .{ .bool = false } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 4, .start_offset = 2, .content = .{ .bool = true } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, "key: false\nkey2: true"); | |
} | |
test "parser: nested objects" { | |
const source = | |
\\parent: | |
\\ child: 0 | |
\\key: 1 | |
\\parent2: | |
\\ child2: | |
\\ child3: 2 | |
\\key2: 3 | |
; | |
try testParsing(&.{ | |
// parent - start | |
.{ .length = 6, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 5, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 0 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// parent - end | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
// key | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 1 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// parent2 | |
.{ .length = 7, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
// child2 | |
.{ .length = 6, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
// child3 | |
.{ .length = 6, .start_offset = 6, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 2 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
// key2 | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 3 } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
test "parser: comments" { | |
const source = | |
\\parent: | |
\\ child: 0 | |
\\ # comment | |
\\# comment | |
\\key: 1 | |
\\parent2: | |
\\ child2: | |
\\ child3: 2 | |
\\key2: 3 | |
; | |
try testParsing(&.{ | |
// parent - start | |
.{ .length = 6, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 5, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 0 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// parent - end | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
// key | |
.{ .length = 3, .start_offset = 22, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 1 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// parent2 | |
.{ .length = 7, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
// child2 | |
.{ .length = 6, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
// child3 | |
.{ .length = 6, .start_offset = 6, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 2 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
// key2 | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 1, .start_offset = 2, .content = .{ .integer = 3 } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
test "parser: single-line strings" { | |
const source = | |
\\key1: "test key1 value" | |
\\key2: 'test key2 value' | |
\\key3: "test\"key3 value" | |
\\key4: 'test\'key4 value' | |
\\key5: test key5 value | |
; | |
try testParsing(&.{ | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 15, .start_offset = 3, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 15, .start_offset = 3, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 16, .start_offset = 3, .content = .{ .string_single_line = .{ .has_escapes = true } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 16, .start_offset = 3, .content = .{ .string_single_line = .{ .has_escapes = true } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 15, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
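
// A minimal sketch (not part of the parser above) of how a consumer might
// materialize a token flagged `.has_escapes = true` into a caller-provided
// buffer, keeping the zero-allocation property. It assumes the only escape
// form exercised by the test above: a backslash that makes the next byte
// literal (`\"` inside double quotes, `\'` inside single quotes).
fn unescapeSingleLineSketch(raw: []const u8, out: []u8) ![]u8 {
    var out_len: usize = 0;
    var i: usize = 0;
    while (i < raw.len) : (i += 1) {
        var byte = raw[i];
        if (byte == '\\' and i + 1 < raw.len) {
            // Skip the backslash and emit the escaped byte verbatim.
            i += 1;
            byte = raw[i];
        }
        if (out_len >= out.len) return error.OutOfSpace;
        out[out_len] = byte;
        out_len += 1;
    }
    return out[0..out_len];
}
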
test "parser: numbers" { | |
const source = | |
\\int: 221 | |
\\flt: 23.12 | |
; | |
try testParsing(&.{ | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 3, .start_offset = 2, .content = .{ .integer = 221 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 5, .start_offset = 2, .content = .{ .float = 23.12 } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
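
// Hypothetical sketch, not the lexer's actual code path: one allocation-free
// way to produce the `.integer`/`.float` payloads checked above is to
// classify the numeric slice by the presence of a '.' and let std.fmt do the
// conversion. `NumberSketch` is an illustrative stand-in, not a type used by
// the parser.
const NumberSketch = union(enum) { integer: i64, float: f64 };

fn parseNumberSliceSketch(slice: []const u8) !NumberSketch {
    if (std.mem.indexOfScalar(u8, slice, '.') != null) {
        return .{ .float = try std.fmt.parseFloat(f64, slice) };
    }
    return .{ .integer = try std.fmt.parseInt(i64, slice, 10) };
}
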
test "parser: inline object" { | |
const source = | |
\\obj: { key1: true, key2: value2 space } | |
; | |
try testParsing(&.{ | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 4, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 4, .start_offset = 2, .content = .{ .bool = true } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 4, .start_offset = 1, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 12, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
test "parser: inline array" { | |
const source = | |
\\obj: [ value1, true, [123, test test] ] | |
; | |
try testParsing(&.{ | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 0, .start_offset = 4, .content = .{ .array_item_start = {} } }, | |
.{ .length = 6, .start_offset = 0, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 1, .content = .{ .array_item_start = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .bool = true } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 2, .content = .{ .array_item_start = {} } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .array_item_start = {} } }, | |
.{ .length = 3, .start_offset = 0, .content = .{ .integer = 123 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 1, .content = .{ .array_item_start = {} } }, | |
.{ .length = 9, .start_offset = 0, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 1, .start_offset = 1, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
test "parser: list array" { | |
const source = | |
\\obj: | |
\\ - value1 | |
\\ - true | |
\\ - key: 12 | |
\\ - key: | |
\\ foo: 34 | |
\\ - key: 56 | |
\\ foo: 78 | |
; | |
try testParsing(&.{ | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 0, .start_offset = 6, .content = .{ .array_item_start = {} } }, | |
.{ .length = 6, .start_offset = 0, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 4, .content = .{ .array_item_start = {} } }, | |
.{ .length = 4, .start_offset = 0, .content = .{ .bool = true } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 4, .content = .{ .array_item_start = {} } }, | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 2, .start_offset = 2, .content = .{ .integer = 12 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 4, .content = .{ .array_item_start = {} } }, | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 3, .start_offset = 8, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 2, .start_offset = 2, .content = .{ .integer = 34 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 1, .start_offset = -1, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 4, .content = .{ .array_item_start = {} } }, | |
.{ .length = 3, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 2, .start_offset = 2, .content = .{ .integer = 56 } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 3, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 2, .start_offset = 2, .content = .{ .integer = 78 } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
test "parser: bug #1" { | |
const source = | |
\\user: | |
\\ home_base_set: { type: boolean, default: false } | |
\\connections: | |
\\ - { source: frontend.app.services.auth, target: backend.auth } | |
\\ - { source: frontend.app.screens.login, target: frontend.app.services.auth } | |
; | |
try testParsing(&.{ | |
// user | |
.{ .length = 4, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
// homebase | |
.{ .length = 13, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
// type | |
.{ .length = 4, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 7, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// default | |
.{ .length = 7, .start_offset = 1, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 5, .start_offset = 2, .content = .{ .bool = false } }, | |
.{ .length = 1, .start_offset = 1, .content = .{ .object_entry_end = {} } }, | |
// homebase - end | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// user - end | |
.{ .length = 1, .start_offset = -1, .content = .{ .object_entry_end = {} } }, | |
// connections | |
.{ .length = 11, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
// connections[0] | |
.{ .length = 0, .start_offset = 8, .content = .{ .array_item_start = {} } }, | |
// source | |
.{ .length = 6, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 26, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// target | |
.{ .length = 6, .start_offset = 1, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 12, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 1, .content = .{ .object_entry_end = {} } }, | |
// connections[0] - end | |
.{ .length = 1, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
// connections[1] | |
.{ .length = 0, .start_offset = 6, .content = .{ .array_item_start = {} } }, | |
// source | |
.{ .length = 6, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 26, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
// target | |
.{ .length = 6, .start_offset = 1, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 26, .start_offset = 2, .content = .{ .string_single_line = .{ .has_escapes = false } } }, | |
.{ .length = 1, .start_offset = 1, .content = .{ .object_entry_end = {} } }, | |
// connections[1] - end | |
.{ .length = 0, .start_offset = 0, .content = .{ .array_item_end = {} } }, | |
// connections - end | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} | |
test "parser: bug #2" { | |
const source = | |
\\components: | |
\\ children: {} | |
; | |
try testParsing(&.{ | |
.{ .length = 10, .start_offset = 0, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 8, .start_offset = 4, .content = .{ .object_entry_start = {} } }, | |
.{ .length = 0, .start_offset = 4, .content = .{ .object_entry_end = {} } }, | |
.{ .length = 0, .start_offset = 0, .content = .{ .object_entry_end = {} } }, | |
}, source); | |
} |