cpdt · February 6, 2020 11:31
diff --git a/flatparser.cpp b/flatparser.cpp
 #include "flatparser.h"

 // Parsing utils
 // Reports whether `c` is horizontal whitespace, i.e. a space or a tab.
 // Line terminators ('\n', '\r') are not treated as whitespace here.
 static bool is_whitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
 }

 // Reports whether `text` points at the end of a line: the terminating NUL,
 // a '\n', or the '\r' of a "\r\n" pair. A '\r' that is not followed by '\n'
 // does not count as a line ending.
 static bool is_end_of_line(const char *text) {
    const char current = text[0];
    if (current == 0 || current == '\n') {
        return true;
    }
    if (current != '\r') {
        return false;
    }
    // The next character is only inspected once the current one is known to be
    // '\r', so the terminator is never read past.
    return text[1] == '\n';
 }

 // Returns a pointer to the first character at or after `text` that is not
 // a space or a tab. Stops at the NUL terminator, which is not whitespace.
 static const char* skip_whitespace(const char *text) {
    const char* cursor = text;
    for (; is_whitespace(*cursor); ++cursor) {
    }
    return cursor;
 }

 // Advances from `text` to the nearest end-of-line position, as defined by
 // is_end_of_line, and returns it. The line terminator itself is not consumed.
 static const char* find_end_of_line(const char *text) {
    const char* cursor = text;
    for (; !is_end_of_line(cursor); ++cursor) {
    }
    return cursor;
 }

 // Parses exactly `num_digits` hexadecimal digits (upper or lower case) starting
 // at `text`. On success the value is stored in `out` and true is returned.
 // On the first non-hex character (which includes the NUL terminator) false is
 // returned; `out` then holds only the digits accumulated so far.
 static bool read_hex(const char* text, size_t num_digits, unsigned& out) {
    out = 0;

    for (size_t remaining = num_digits; remaining != 0; --remaining, ++text) {
        const char digit = *text;

        int nibble;
        if ('0' <= digit && digit <= '9') {
            nibble = digit - '0';
        } else if ('a' <= digit && digit <= 'f') {
            nibble = digit - 'a' + 10;
        } else if ('A' <= digit && digit <= 'F') {
            nibble = digit - 'A' + 10;
        } else {
            return false;
        }

        // Make room for the new digit in the low four bits.
        out = (out << 4) + nibble;
    }

    return true;
 }

 // Appends the UTF-8 encoding of code point `ch` to `out`, using one to four
 // bytes depending on its magnitude. The byte layout follows the table at
 // https://en.wikipedia.org/wiki/UTF-8#description
 // No validation is done: the value is encoded as given.
 static void write_utf8_char(unsigned ch, std::string& out) {
    // Every continuation byte is a 10xxxxxx marker carrying six payload bits.
    const auto continuation = [](unsigned bits) {
        return static_cast<char>(0x80u + (bits & 0x3Fu));
    };

    if (ch < 0x80u) {
        // 0xxxxxxx
        out.push_back(static_cast<char>(ch));
        return;
    }
    if (ch < 0x800u) {
        // 110xxxxx 10xxxxxx
        out.push_back(static_cast<char>(0xC0u + ((ch >> 6) & 0x1Fu)));
        out.push_back(continuation(ch));
        return;
    }
    if (ch < 0x10000u) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        out.push_back(static_cast<char>(0xE0u + ((ch >> 12) & 0x0Fu)));
        out.push_back(continuation(ch >> 6));
        out.push_back(continuation(ch));
        return;
    }
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    out.push_back(static_cast<char>(0xF0u + ((ch >> 18) & 0x07u)));
    out.push_back(continuation(ch >> 12));
    out.push_back(continuation(ch >> 6));
    out.push_back(continuation(ch));
 }

 // Decodes one escape sequence. `text` points at the character immediately
 // after the backslash; whatever the escape stands for is appended to `out`.
 // Returns a pointer to the first character after the escape sequence.
 //
 // Supported escapes:
 //   \b \f \n \r \t    the corresponding control character
 //   \<newline>        line continuation: emits nothing and swallows the line
 //                     ending ("\n" or "\r\n")
 //   \xHH              code point given by two hex digits, written as UTF-8
 //   \uHHHH            code point given by four hex digits, written as UTF-8
 //   \<anything else>  the character itself
 //
 // A backslash directly before the terminating NUL emits nothing, and the
 // returned pointer stays on the NUL so callers never step past the buffer.
 // A malformed \x or \u escape emits nothing and resumes right after the
 // 'x'/'u', so the characters that follow are read literally.
 static const char* process_escape_sequence(const char* text, std::string& out) {
    const char id_char = *text;

    if (id_char == 0) {
        // Dangling backslash at end of input: do not advance past the terminator.
        return text;
    }
    text++;

    switch (id_char) {
    case 'b':
        out.push_back('\b');
        break;
    case 'f':
        out.push_back('\f');
        break;
    case 'n':
        out.push_back('\n');
        break;
    case 'r':
        out.push_back('\r');
        break;
    case 't':
        out.push_back('\t');
        break;
    case '\n':
        // Line continuation: produce no characters but consume the newline
        break;
    case '\r':
        if (*text == '\n') {
            // Line continuation with a "\r\n" ending: consume both characters
            text++;
        } else {
            // A lone escaped carriage return is kept as a literal character
            out.push_back('\r');
        }
        break;
    case 'x': {
        // Hexadecimal escape sequence - the next 2 characters are the character in base 16
        unsigned char_code;
        if (!read_hex(text, 2, char_code)) {
            break;
        }
        write_utf8_char(char_code, out);
        text += 2;
        break;
    }
    case 'u': {
        // Unicode escape sequence - the next 4 characters are the character in base 16
        unsigned char_code;
        if (!read_hex(text, 4, char_code)) {
            break;
        }
        write_utf8_char(char_code, out);
        text += 4;
        break;
    }
    default:
        // Unknown escape: the escaped character stands for itself
        out.push_back(id_char);
        break;
    }

    return text;
 }

 // Copies characters from `text` into `out`, decoding backslash escapes along
 // the way, until an end of line is reached or `is_ended` reports true for the
 // current position. Escaped characters are never offered to `is_ended`.
 // Returns the position where reading stopped; that character is not consumed.
 static const char* read_escapable(const char* text, std::string& out, bool (&is_ended)(const char*)) {
    for (;;) {
        // Scan a run of ordinary characters up to the next backslash or terminator
        const char* const run_begin = text;
        for (; *text != '\\'; ++text) {
            if (is_end_of_line(text) || is_ended(text)) {
                break;
            }
        }
        out.append(run_begin, static_cast<size_t>(text - run_begin));

        if (*text != '\\') {
            return text;
        }

        // Step over the backslash and let the escape handler consume the rest
        text = process_escape_sequence(text + 1, out);
    }
 }

 // Terminator predicate for keys: true when nothing but spaces/tabs (possibly
 // none) separates `text` from an '='.
 static bool has_key_ended(const char* text) {
    const char* const next_token = skip_whitespace(text);
    return '=' == *next_token;
 }

 // Terminator predicate for quoted values: true when `text` is at a double quote.
 static bool has_quoted_value_ended(const char* text) {
    const char current = text[0];
    return current == '"';
 }

 // Terminator predicate for unquoted values: true when only spaces/tabs remain
 // between `text` and the end of the line, so trailing whitespace is left out
 // of the value.
 static bool has_unqoted_value_ended(const char* text) {
    return is_end_of_line(skip_whitespace(text));
 }

 // Parses one line starting at `text`. If it is a key/value pair, the pair is
 // reported through `cb(ctx, key, value)`. Comment lines and lines without an
 // '=' are ignored. Returns the position where parsing stopped; the caller is
 // expected to skip whatever is left of the line.
 static const char* parse_line(const char* text, OnRowCb& cb, void* ctx) {
    // Leading spaces/tabs are not part of the line's content
    const char* cursor = skip_whitespace(text);

    // A line is one of:
    //   ; comment               - ignored entirely
    //   key="quoted value"      - value runs up to the closing quote
    //   key=unquoted value      - value runs up to the end of the line
    // Escape sequences are decoded in keys and in both kinds of value.
    if (*cursor == ';') {
        return cursor;
    }

    // Read the key, then step over any whitespace between it and the '='
    std::string key;
    cursor = skip_whitespace(read_escapable(cursor, key, has_key_ended));

    if (*cursor != '=') {
        // The key ran into a newline or the end of input: not a valid pair
        return cursor;
    }

    // Step over the '=' and any whitespace before the value (or its opening quote)
    cursor = skip_whitespace(cursor + 1);

    std::string value;
    const bool quoted = (*cursor == '"');
    cursor = quoted ? read_escapable(cursor + 1, value, has_quoted_value_ended)
                    : read_escapable(cursor, value, has_unqoted_value_ended);

    // Hand the finished pair over to the caller
    cb(ctx, std::move(key), std::move(value));
    return cursor;
 }

 // Parses the NUL-terminated buffer `text` line by line, calling `cb` with
 // `ctx` for every key/value pair that is found.
 void parse_flat_ini(const char *text, OnRowCb& cb, void* ctx) {
    for (const char* cursor = text; *cursor != 0;) {
        // Handle the line, then skip anything that is left over on it
        cursor = find_end_of_line(parse_line(cursor, cb, ctx));

        // Step over one terminator character unless the input is exhausted.
        // For "\r\n" this lands on the '\n', which the next iteration handles
        // as an empty line.
        if (*cursor != 0) {
            ++cursor;
        }
    }
 }
diff --git a/flatparser.h b/flatparser.h
 #pragma once

 #include <string>

 // Callback type invoked once for every key/value pair the parser finds.
 // `ctx` is the pointer that was handed to parse_flat_ini, passed through
 // unchanged. `key` and `value` arrive as rvalues with escape sequences
 // already decoded, so the callback may move from them.
 using OnRowCb = void (void* ctx, std::string&& key, std::string&& value);
 // Parses the NUL-terminated buffer `text` line by line and calls
 // `cb(ctx, key, value)` for each key/value line. Lines starting with ';'
 // (comments) and lines without an '=' produce no callback.
 void parse_flat_ini(const char* text, OnRowCb& cb, void* ctx);
	#include "flatparser.h"

	// Parsing utils
	// Reports whether `c` is horizontal whitespace, i.e. a space or a tab.
	// Line terminators ('\n', '\r') are not treated as whitespace here.
	static bool is_whitespace(char c) {
	    return c == ' ' || c == '\t';
	}

	// Reports whether `text` points at the end of a line: the terminating NUL,
	// a '\n', or the '\r' of a "\r\n" pair. A '\r' that is not followed by '\n'
	// does not count as a line ending.
	static bool is_end_of_line(const char *text) {
	    // The character under the pointer is tested, not the pointer itself.
	    return *text == 0 || *text == '\n' || (text[0] == '\r' && text[1] == '\n');
	}

	// Returns a pointer to the first character at or after `text` that is not
	// a space or a tab. Stops at the NUL terminator, which is not whitespace.
	static const char* skip_whitespace(const char *text) {
	    const char* cursor = text;
	    for (; is_whitespace(*cursor); ++cursor) {
	    }
	    return cursor;
	}

	// Advances from `text` to the nearest end-of-line position, as defined by
	// is_end_of_line, and returns it. The line terminator itself is not consumed.
	static const char* find_end_of_line(const char *text) {
	    const char* cursor = text;
	    for (; !is_end_of_line(cursor); ++cursor) {
	    }
	    return cursor;
	}

	// Parses exactly `num_digits` hexadecimal digits (upper or lower case) starting
	// at `text`. On success the value is stored in `out` and true is returned.
	// On the first non-hex character (which includes the NUL terminator) false is
	// returned; `out` then holds only the digits accumulated so far.
	static bool read_hex(const char* text, size_t num_digits, unsigned& out) {
	    out = 0;

	    for (size_t remaining = num_digits; remaining != 0; --remaining, ++text) {
	        const char digit = *text;

	        int nibble;
	        if ('0' <= digit && digit <= '9') {
	            nibble = digit - '0';
	        } else if ('a' <= digit && digit <= 'f') {
	            nibble = digit - 'a' + 10;
	        } else if ('A' <= digit && digit <= 'F') {
	            nibble = digit - 'A' + 10;
	        } else {
	            return false;
	        }

	        // Make room for the new digit in the low four bits.
	        out = (out << 4) + nibble;
	    }

	    return true;
	}

	// Appends the UTF-8 encoding of code point `ch` to `out`, using one to four
	// bytes depending on its magnitude. The byte layout follows the table at
	// https://en.wikipedia.org/wiki/UTF-8#description
	// No validation is done: the value is encoded as given.
	static void write_utf8_char(unsigned ch, std::string& out) {
	    // Every continuation byte is a 10xxxxxx marker carrying six payload bits.
	    const auto continuation = [](unsigned bits) {
	        return static_cast<char>(0x80u + (bits & 0x3Fu));
	    };

	    if (ch < 0x80u) {
	        // 0xxxxxxx
	        out.push_back(static_cast<char>(ch));
	        return;
	    }
	    if (ch < 0x800u) {
	        // 110xxxxx 10xxxxxx
	        out.push_back(static_cast<char>(0xC0u + ((ch >> 6) & 0x1Fu)));
	        out.push_back(continuation(ch));
	        return;
	    }
	    if (ch < 0x10000u) {
	        // 1110xxxx 10xxxxxx 10xxxxxx
	        out.push_back(static_cast<char>(0xE0u + ((ch >> 12) & 0x0Fu)));
	        out.push_back(continuation(ch >> 6));
	        out.push_back(continuation(ch));
	        return;
	    }
	    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	    out.push_back(static_cast<char>(0xF0u + ((ch >> 18) & 0x07u)));
	    out.push_back(continuation(ch >> 12));
	    out.push_back(continuation(ch >> 6));
	    out.push_back(continuation(ch));
	}

	// Decodes one escape sequence. `text` points at the character immediately
	// after the backslash; whatever the escape stands for is appended to `out`.
	// Returns a pointer to the first character after the escape sequence.
	//
	// Supported escapes:
	//   \b \f \n \r \t    the corresponding control character
	//   \<newline>        line continuation: emits nothing and swallows the line
	//                     ending ("\n" or "\r\n")
	//   \xHH              code point given by two hex digits, written as UTF-8
	//   \uHHHH            code point given by four hex digits, written as UTF-8
	//   \<anything else>  the character itself
	//
	// A backslash directly before the terminating NUL emits nothing, and the
	// returned pointer stays on the NUL so callers never step past the buffer.
	// A malformed \x or \u escape emits nothing and resumes right after the
	// 'x'/'u', so the characters that follow are read literally.
	static const char* process_escape_sequence(const char* text, std::string& out) {
	    const char id_char = *text;

	    if (id_char == 0) {
	        // Dangling backslash at end of input: do not advance past the terminator.
	        return text;
	    }
	    text++;

	    switch (id_char) {
	    case 'b':
	        out.push_back('\b');
	        break;
	    case 'f':
	        out.push_back('\f');
	        break;
	    case 'n':
	        out.push_back('\n');
	        break;
	    case 'r':
	        out.push_back('\r');
	        break;
	    case 't':
	        out.push_back('\t');
	        break;
	    case '\n':
	        // Line continuation: produce no characters but consume the newline
	        break;
	    case '\r':
	        if (*text == '\n') {
	            // Line continuation with a "\r\n" ending: consume both characters
	            text++;
	        } else {
	            // A lone escaped carriage return is kept as a literal character
	            out.push_back('\r');
	        }
	        break;
	    case 'x': {
	        // Hexadecimal escape sequence - the next 2 characters are the character in base 16
	        unsigned char_code;
	        if (!read_hex(text, 2, char_code)) {
	            break;
	        }
	        write_utf8_char(char_code, out);
	        text += 2;
	        break;
	    }
	    case 'u': {
	        // Unicode escape sequence - the next 4 characters are the character in base 16
	        unsigned char_code;
	        if (!read_hex(text, 4, char_code)) {
	            break;
	        }
	        write_utf8_char(char_code, out);
	        text += 4;
	        break;
	    }
	    default:
	        // Unknown escape: the escaped character stands for itself
	        out.push_back(id_char);
	        break;
	    }

	    return text;
	}

	// Copies characters from `text` into `out`, decoding backslash escapes along
	// the way, until an end of line is reached or `is_ended` reports true for the
	// current position. Escaped characters are never offered to `is_ended`.
	// Returns the position where reading stopped; that character is not consumed.
	static const char* read_escapable(const char* text, std::string& out, bool (&is_ended)(const char*)) {
	    for (;;) {
	        // Scan a run of ordinary characters up to the next backslash or terminator
	        const char* const run_begin = text;
	        for (; *text != '\\'; ++text) {
	            if (is_end_of_line(text) || is_ended(text)) {
	                break;
	            }
	        }
	        out.append(run_begin, static_cast<size_t>(text - run_begin));

	        if (*text != '\\') {
	            return text;
	        }

	        // Step over the backslash and let the escape handler consume the rest
	        text = process_escape_sequence(text + 1, out);
	    }
	}

	// Terminator predicate for keys: true when nothing but spaces/tabs (possibly
	// none) separates `text` from an '='.
	static bool has_key_ended(const char* text) {
	    const char* const next_token = skip_whitespace(text);
	    return '=' == *next_token;
	}

	// Terminator predicate for quoted values: true when `text` is at a double quote.
	static bool has_quoted_value_ended(const char* text) {
	    const char current = text[0];
	    return current == '"';
	}

	// Terminator predicate for unquoted values: true when only spaces/tabs remain
	// between `text` and the end of the line, so trailing whitespace is left out
	// of the value.
	static bool has_unqoted_value_ended(const char* text) {
	    return is_end_of_line(skip_whitespace(text));
	}

	// Parses one line starting at `text`. If it is a key/value pair, the pair is
	// reported through `cb(ctx, key, value)`. Comment lines and lines without an
	// '=' are ignored. Returns the position where parsing stopped; the caller is
	// expected to skip whatever is left of the line.
	static const char* parse_line(const char* text, OnRowCb& cb, void* ctx) {
	    // Leading spaces/tabs are not part of the line's content
	    const char* cursor = skip_whitespace(text);

	    // A line is one of:
	    //   ; comment               - ignored entirely
	    //   key="quoted value"      - value runs up to the closing quote
	    //   key=unquoted value      - value runs up to the end of the line
	    // Escape sequences are decoded in keys and in both kinds of value.
	    if (*cursor == ';') {
	        return cursor;
	    }

	    // Read the key, then step over any whitespace between it and the '='
	    std::string key;
	    cursor = skip_whitespace(read_escapable(cursor, key, has_key_ended));

	    if (*cursor != '=') {
	        // The key ran into a newline or the end of input: not a valid pair
	        return cursor;
	    }

	    // Step over the '=' and any whitespace before the value (or its opening quote)
	    cursor = skip_whitespace(cursor + 1);

	    std::string value;
	    const bool quoted = (*cursor == '"');
	    cursor = quoted ? read_escapable(cursor + 1, value, has_quoted_value_ended)
	                    : read_escapable(cursor, value, has_unqoted_value_ended);

	    // Hand the finished pair over to the caller
	    cb(ctx, std::move(key), std::move(value));
	    return cursor;
	}

	// Parses the NUL-terminated buffer `text` line by line, calling `cb` with
	// `ctx` for every key/value pair that is found.
	// `text` and `ctx` are pointers, matching the declaration in flatparser.h.
	void parse_flat_ini(const char* text, OnRowCb& cb, void* ctx) {
	    while (*text != 0) {
	        // Handle the line, then skip anything that is left over on it
	        text = parse_line(text, cb, ctx);
	        text = find_end_of_line(text);

	        // Step over one terminator character unless the input is exhausted.
	        // For "\r\n" this lands on the '\n', which the next iteration handles
	        // as an empty line.
	        if (*text != 0) {
	            text++;
	        }
	    }
	}
	#pragma once

	#include <string>

	// Callback type invoked once for every key/value pair the parser finds.
	// `ctx` is the pointer that was handed to parse_flat_ini, passed through
	// unchanged. `key` and `value` arrive as rvalues with escape sequences
	// already decoded, so the callback may move from them.
	using OnRowCb = void (void* ctx, std::string&& key, std::string&& value);
	// Parses the NUL-terminated buffer `text` line by line and calls
	// `cb(ctx, key, value)` for each key/value line. Lines starting with ';'
	// (comments) and lines without an '=' produce no callback.
	void parse_flat_ini(const char* text, OnRowCb& cb, void* ctx);