Last active
February 6, 2020 11:31
-
-
Save cpdt/5a05c3db8b7a91961b506136fd1f77b0 to your computer and use it in GitHub Desktop.
A simple and pretty fast "flat INI" parser - for when you just need to read a list of key/value pairs that can have escape sequences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "flatparser.h" | |
// Parsing utils | |
// Reports whether `c` is one of the two in-line blank characters this parser skips
// over: a space or a horizontal tab. Line terminators are not matched here.
static bool is_whitespace(char c) {
    switch (c) {
        case ' ':
        case '\t':
            return true;
        default:
            return false;
    }
}
// Reports whether `text` points at something that terminates a line: the NUL
// terminator, a line feed, or a carriage return immediately followed by a line feed.
// A carriage return on its own does not count.
static bool is_end_of_line(const char *text) {
    const char first = text[0];
    if (first == '\0' || first == '\n') {
        return true;
    }
    // Only peek at the second character once we know the first one is '\r',
    // so we never read past the terminator
    return first == '\r' && text[1] == '\n';
}
static const char* skip_whitespace(const char *text) { | |
while (is_whitespace(*text)) { | |
text++; | |
} | |
return text; | |
} | |
static const char* find_end_of_line(const char *text) { | |
while (!is_end_of_line(text)) { | |
text++; | |
} | |
return text; | |
} | |
// Parses exactly `num_digits` hexadecimal digits (either case) starting at `text`.
//
// On success returns true and stores the parsed value in `out`.
// On failure (a non-hex character, including the NUL terminator, is hit before
// `num_digits` digits were read) returns false; `out` then holds whatever had been
// accumulated from the digits before the offending character.
static bool read_hex(const char* text, size_t num_digits, unsigned& out) {
    out = 0;
    for (size_t remaining = num_digits; remaining != 0; --remaining, ++text) {
        const char c = *text;
        unsigned nibble;
        if ('0' <= c && c <= '9') {
            nibble = static_cast<unsigned>(c - '0');
        } else if ('A' <= c && c <= 'F') {
            nibble = static_cast<unsigned>(10 + c - 'A');
        } else if ('a' <= c && c <= 'f') {
            nibble = static_cast<unsigned>(10 + c - 'a');
        } else {
            return false;
        }
        // Make room for the new digit in the low four bits, then drop it in
        out = (out << 4) | nibble;
    }
    return true;
}
// Appends the UTF-8 encoding of the code point `ch` to `out`.
//
// The byte layout follows the table at https://en.wikipedia.org/wiki/UTF-8#description:
//   U+0000  .. U+007F   -> 1 byte
//   U+0080  .. U+07FF   -> 2 bytes
//   U+0800  .. U+FFFF   -> 3 bytes
//   U+10000 .. U+10FFFF -> 4 bytes
//
// Values that are not Unicode scalar values - the surrogate range U+D800..U+DFFF and
// anything above U+10FFFF - have no well-formed UTF-8 encoding, so U+FFFD (REPLACEMENT
// CHARACTER) is written in their place rather than emitting an ill-formed sequence.
static void write_utf8_char(unsigned ch, std::string& out) {
    const bool is_surrogate = ch >= 0xD800u && ch <= 0xDFFFu;
    if (is_surrogate || ch > 0x10FFFFu) {
        ch = 0xFFFDu;
    }

    if (ch < 0x80u) {
        out.push_back(static_cast<char>(ch));
    } else if (ch < 0x800u) {
        out.push_back(static_cast<char>(0xC0u | (ch >> 6)));
        out.push_back(static_cast<char>(0x80u | (ch & 0x3Fu)));
    } else if (ch < 0x10000u) {
        out.push_back(static_cast<char>(0xE0u | (ch >> 12)));
        out.push_back(static_cast<char>(0x80u | ((ch >> 6) & 0x3Fu)));
        out.push_back(static_cast<char>(0x80u | (ch & 0x3Fu)));
    } else {
        out.push_back(static_cast<char>(0xF0u | (ch >> 18)));
        out.push_back(static_cast<char>(0x80u | ((ch >> 12) & 0x3Fu)));
        out.push_back(static_cast<char>(0x80u | ((ch >> 6) & 0x3Fu)));
        out.push_back(static_cast<char>(0x80u | (ch & 0x3Fu)));
    }
}
static const char* process_escape_sequence(const char* text, std::string& out) { | |
auto id_char = *text; | |
text++; | |
switch (id_char) { | |
case 0: | |
return text; | |
case 'b': | |
out.push_back('\b'); | |
break; | |
case 'f': | |
out.push_back('\f'); | |
break; | |
case 'n': | |
out.push_back('\n'); | |
break; | |
case 'r': | |
out.push_back('\r'); | |
break; | |
case 't': | |
out.push_back('\t'); | |
break; | |
case '\n': | |
// Line continuation: produce no characters but consume the newline | |
break; | |
case 'x': { | |
// Hexadecimal escape sequence - the next 2 characters are the character in base 16 | |
unsigned char_code; | |
if (!read_hex(text, 2, char_code)) { | |
break; | |
} | |
write_utf8_char(char_code, out); | |
text += 2; | |
break; | |
} | |
case 'u': { | |
// Unicode escape sequence - the next 4 characters are the character in base 16 | |
unsigned char_code; | |
if (!read_hex(text, 4, char_code)) { | |
break; | |
} | |
write_utf8_char(char_code, out); | |
text += 4; | |
break; | |
} | |
default: | |
out.push_back(id_char); | |
break; | |
} | |
return text; | |
} | |
static const char* read_escapable(const char* text, std::string& out, bool (&is_ended)(const char*)) { | |
while (true) { | |
// Find the next chunk delimiter | |
const char* chunk_start = text; | |
while (*text != '\\' && !is_end_of_line(text) && !is_ended(text)) { | |
text++; | |
} | |
out.append(chunk_start, static_cast<size_t>(text - chunk_start)); | |
if (*text != '\\') { | |
break; | |
} | |
text = process_escape_sequence(text + 1, out); | |
} | |
return text; | |
} | |
static bool has_key_ended(const char* text) { | |
return *skip_whitespace(text) == '='; | |
} | |
// Terminator predicate for quoted values: true when `text` points at a double quote.
static bool has_quoted_value_ended(const char* text) {
    const char closing_quote = '"';
    return text[0] == closing_quote;
}
static bool has_unqoted_value_ended(const char* text) { | |
const char* after_ws = skip_whitespace(text); | |
return is_end_of_line(after_ws); | |
} | |
static const char* parse_line(const char* text, OnRowCb& cb, void* ctx) { | |
// Skip whitespace that might be at the start of a line | |
text = skip_whitespace(text); | |
// There are three types of lines: | |
// ; Single line comments - we can ignore everything until the next line | |
// Quoted="key value pairs" - after we extract the key and value contents, escape codes need to be replaced | |
// Unquoted=key value pairs - like the quoted ones but without quotes | |
if (*text == ';') { | |
// This line is a comment - don't bother doing anything with it | |
return text; | |
} | |
// Read the key, handling possible escape sequences. | |
std::string key; | |
text = read_escapable(text, key, has_key_ended); | |
// There might be whitespace left between the key and the = | |
text = skip_whitespace(text); | |
if (*text != '=') { | |
// The key didn't end on what we expected, so this is either a newline or EOF. | |
// In either case, it's an invalid line, we can't do anything more with it | |
return text; | |
} | |
// Skip any whitespace that might be after the = but before the " to start the value | |
text = skip_whitespace(text + 1); | |
std::string value; | |
if (*text == '"') { | |
text = read_escapable(text + 1, value, has_quoted_value_ended); | |
} else { | |
text = read_escapable(text, value, has_unqoted_value_ended); | |
} | |
// The line is over - commit the data and go to the next line | |
cb(ctx, std::move(key), std::move(value)); | |
return text; | |
} | |
void parse_flat_ini(const char *text, OnRowCb& cb, void* ctx) { | |
while (*text != 0) { | |
text = parse_line(text, cb, ctx); | |
text = find_end_of_line(text); | |
if (*text != 0) { | |
text++; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <string> | |
// Signature of the function parse_flat_ini calls for each key/value row it reads.
//   ctx   - the same pointer that was passed to parse_flat_ini, forwarded untouched
//   key   - the row's key with escape sequences already expanded
//   value - the row's value with escape sequences already expanded
// Both strings are passed as rvalues, so the callback may move from them.
using OnRowCb = void (void* ctx, std::string&& key, std::string&& value); | |
// Parses the NUL-terminated "flat INI" text and invokes `cb` once for every line that
// contains a key, an '=' and a (possibly empty) value. Lines starting with ';' and lines
// without an '=' produce no callback. `ctx` is passed through to `cb` as its first argument.
void parse_flat_ini(const char* text, OnRowCb& cb, void* ctx); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment