Explorer09 · March 30, 2025 19:02
diff --git a/utf9.c b/utf9.c
 /** @file utf9.c
 UTF-9 (RFC 4042) conversion functions.

 These functions implement UTF-9, a transformation format in an April
 Fools' RFC document, RFC 4042.

 Copyright 2025 Kang-Che Sung <explorer09 @ gmail.com>

 MIT License (MIT/Expat)

 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice shall be included
 in all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 /* SPDX-License-Identifier: MIT */

 #include <assert.h>
 #include <stddef.h>
 #include <stdint.h>

 // UINT9 is the storage type for one nonet (9-bit byte). uint9_t is used when
 // the UINT9_MAX or UINT9_WIDTH macro is defined; otherwise the smallest
 // unsigned type with at least 16 bits stands in for it.
 #if defined(UINT9_MAX) || defined(UINT9_WIDTH)
 #define UINT9 uint9_t
 #else
 #define UINT9 uint_least16_t
 #endif

 // State carried between calls to utf9_decoder_add_nonet().
 struct utf9_decoder_state {
    // Data bits accumulated from the nonets of the sequence in progress.
    // The decoder treats 0 as "no sequence in progress" and resets the
    // member to 0 after a completed or rejected sequence.
    unsigned int value;
 };

 // Two zero nonets. A zero nonet has the 0x100 continuation bit clear, so it
 // ends whatever sequence utf9_to_code_point() is still reading.
 // NOTE(review): presumably meant to be appended to a buffer so that decoding
 // always stops inside it, the second nonet still decoding as U+0000 when the
 // first is absorbed by a truncated sequence -- confirm against callers.
 const UINT9 utf9_safe_terminator[] = {0x000, 0x000};

 /**
  * Decodes the next code point from a UTF-9 nonet stream.
  *
  * The low 8 bits of each nonet carry data and bit 0x100 signals that another
  * nonet follows. No length is passed in: nonets are read until one without
  * the continuation bit, or an invalid one, is reached.
  *
  * Following the WHATWG advice not to "mask" the bytes after an illegal
  * combination, decoding stops at the nonet that would invalidate the
  * sequence (see <https://encoding.spec.whatwg.org/#security-background>).
  * The first nonet is always consumed; an offending later nonet is left in
  * place so that the next call starts from it.
  *
  * @param sequence In/out cursor into the nonet stream; advanced past every
  *                 nonet that was consumed.
  * @return The decoded code point, or (uint_least32_t)-1 for an invalid
  *         sequence.
  */
 uint_least32_t utf9_to_code_point(const UINT9 *sequence[static 1])
 {
    const uint_least32_t invalid = (uint_least32_t)-1;

    // The leading nonet is consumed unconditionally, so a caller that keeps
    // calling after an error always makes progress.
    const unsigned int lead = **sequence;
    ++*sequence;

    // Leading nonets that can never start a valid sequence:
    //  - above 0x1FF: not a 9-bit byte (impossible on a true 9-bit machine);
    //  - 0x100: would make the sequence overlong (the nonets after it may
    //    still form a valid sequence once this one is skipped);
    //  - 0x1D8..0x1DF: leads either to a UTF-16 surrogate (U+D800..U+DFFF)
    //    or to a value outside the Unicode range.
    if (lead > 0x1FF || lead == 0x100 || (lead >= 0x1D8 && lead <= 0x1DF)) {
        return invalid;
    }

    uint_least32_t code_point = lead & 0xFF;
    if (lead < 0x100) {
        // Single-nonet sequence.
        return code_point;
    }

    for (;;) {
        const unsigned int next = **sequence;

        if (next > 0x1FF) {
            // Not a 9-bit byte; leave it unconsumed.
            return invalid;
        }

        const int continues = next >= 0x100;
        if (continues && code_point >= 0x11) {
            // One more nonet after this one would push the result outside
            // the [0x00, 0x10FFFF] range; leave it unconsumed.
            return invalid;
        }

        ++*sequence;
        code_point = (code_point << 8) | (next & 0xFF);
        if (!continues) {
            return code_point;
        }
    }
 }

 /**
  * Feeds one nonet to the incremental UTF-9 decoder.
  *
  * @param ps    Decoder state; ps->value holds the data bits collected so
  *              far and is 0 between sequences.
  * @param nonet The next nonet of the stream.
  * @return The decoded code point when this nonet completes a sequence;
  *         (uint_least32_t)-2 when more nonets are needed;
  *         (uint_least32_t)-1 when the nonet is invalid at this position.
  *         After an invalid nonet the state is 0, so the next nonet starts
  *         a new sequence. The rejected nonet itself is not re-examined.
  */
 uint_least32_t utf9_decoder_add_nonet(
                   struct utf9_decoder_state ps[static 1],
                   unsigned int nonet)
 {
    const uint_least32_t invalid = (uint_least32_t)-1;
    const uint_least32_t incomplete = (uint_least32_t)-2;

    if (nonet > 0x1FF) {
        // Not a 9-bit byte (cannot happen on a true 9-bit processor).
        ps->value = 0;
        return invalid;
    }

    const int has_more = nonet >= 0x100;

    // At the start of a sequence two kinds of nonet are refused outright:
    // 0x100, which would make the sequence overlong, and 0x1D8..0x1DF,
    // which leads either to a UTF-16 surrogate (U+D800..U+DFFF) or to a
    // value outside the Unicode range. The state is already 0 here.
    if (ps->value == 0 &&
        (nonet == 0x100 || (nonet >= 0x1D8 && nonet <= 0x1DF))) {
        return invalid;
    }

    assert(ps->value <= 0x10FF);

    if (has_more && ps->value >= 0x11) {
        // Another nonet would leave the [0x00, 0x10FFFF] range.
        ps->value = 0;
        return invalid;
    }

    const uint_least32_t combined =
        ((uint_least32_t)ps->value << 8) | (nonet & 0xFF);

    if (has_more) {
        ps->value = (unsigned int)combined;
        return incomplete;
    }

    ps->value = 0;
    return combined;
 }

 /**
  * Encodes one code point as UTF-9.
  *
  * The code point is split into 8-bit groups, most significant first. Every
  * nonet except the last has bit 0x100 set to signal that another follows.
  *
  * @param buffer     Destination for the nonets; written only when the
  *                   whole encoding fits.
  * @param max_length Capacity of buffer, in nonets.
  * @param code_point Code point to encode. The asserts require it to be at
  *                   most 0x10FFFF and not a surrogate (U+D800..U+DFFF).
  * @return The number of nonets the encoding needs. A value greater than
  *         max_length means nothing was written.
  */
 size_t utf32_to_utf9(UINT9 buffer[/* max_length */], size_t max_length,
                     uint_least32_t code_point)
 {
    assert(code_point <= 0x10FFFF);
    assert(code_point < 0xD800 || code_point > 0xDFFF);

    // One nonet per 8 significant bits, and at least one (for U+0000).
    size_t length = 1;
    for (uint_least32_t rest = code_point >> 8; rest != 0; rest >>= 8) {
        length++;
    }

    if (length > max_length) {
        return length;
    }

    // Fill from the end: the final nonet carries no continuation bit, every
    // nonet before it does.
    size_t pos = length;
    buffer[--pos] = code_point & 0xFF;
    while (pos > 0) {
        code_point >>= 8;
        buffer[--pos] = 0x100 | (code_point & 0xFF);
    }
    return length;
 }
	/** @file utf9.c
	UTF-9 (RFC 4042) conversion functions.

	These functions implement UTF-9, a transformation format in an April
	Fools' RFC document, RFC 4042.

	Copyright 2025 Kang-Che Sung <explorer09 @ gmail.com>

	MIT License (MIT/Expat)

	Permission is hereby granted, free of charge, to any person obtaining a
	copy of this software and associated documentation files (the
	"Software"), to deal in the Software without restriction, including
	without limitation the rights to use, copy, modify, merge, publish,
	distribute, sublicense, and/or sell copies of the Software, and to
	permit persons to whom the Software is furnished to do so, subject to
	the following conditions:

	The above copyright notice and this permission notice shall be included
	in all copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
	*/
	/* SPDX-License-Identifier: MIT */

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	// UINT9 is the storage type for one nonet (9-bit byte). uint9_t is used when
	// the UINT9_MAX or UINT9_WIDTH macro is defined; otherwise the smallest
	// unsigned type with at least 16 bits stands in for it.
	#if defined(UINT9_MAX) || defined(UINT9_WIDTH)
	#define UINT9 uint9_t
	#else
	#define UINT9 uint_least16_t
	#endif

	// State carried between calls to utf9_decoder_add_nonet().
	struct utf9_decoder_state {
	// Data bits accumulated from the nonets of the sequence in progress.
	// The decoder treats 0 as "no sequence in progress" and resets the
	// member to 0 after a completed or rejected sequence.
	unsigned int value;
	};

	// Two zero nonets. A zero nonet has the 0x100 continuation bit clear, so it
	// ends whatever sequence utf9_to_code_point() is still reading.
	// NOTE(review): presumably meant to be appended to a buffer so that decoding
	// always stops inside it, the second nonet still decoding as U+0000 when the
	// first is absorbed by a truncated sequence -- confirm against callers.
	const UINT9 utf9_safe_terminator[] = {0x000, 0x000};

	/**
	 * Decodes the next code point from a UTF-9 nonet stream.
	 *
	 * The low 8 bits of each nonet carry data and bit 0x100 signals that another
	 * nonet follows. No length is passed in: nonets are read until one without
	 * the continuation bit, or an invalid one, is reached.
	 *
	 * The first nonet is always consumed. A later nonet that would invalidate
	 * the sequence is left in place so that the next call starts from it.
	 *
	 * @param sequence In/out cursor into the nonet stream; advanced past every
	 *                 nonet that was consumed.
	 * @return The decoded code point, or (uint_least32_t)-1 for an invalid
	 *         sequence.
	 */
	uint_least32_t utf9_to_code_point(const UINT9 *sequence[static 1])
	{
	    // We follow the recommendation from WHATWG to "not mask" any
	    // subsequent byte in case of an illegal byte combination.
	    // That is, we stop whenever a byte that would cause the sequence
	    // to become invalid is found.
	    // (See <https://encoding.spec.whatwg.org/#security-background>.)

	    // Read the nonet the cursor points at, then advance the cursor.
	    unsigned int nonet = *(*sequence)++;

	    if (nonet > 0x1FF) {
	        // Not a 9-bit byte. (This condition would be a no-op on a true
	        // 9-bit processor.)
	        return (uint_least32_t)-1;
	    }
	    if (nonet == 0x100) {
	        // The sequence would become overlong with this nonet.
	        // Subsequent nonets may form a valid sequence if this nonet is
	        // skipped.
	        return (uint_least32_t)-1;
	    }
	    if (nonet >= 0x1D8 && nonet <= 0x1DF) {
	        // Either a UTF-16 surrogate (U+D800 to U+DFFF) code unit, or a
	        // value that would be outside the Unicode range.
	        return (uint_least32_t)-1;
	    }

	    uint_least32_t value = nonet & 0xFF;

	    // Bit 0x100 set on the nonet just handled means another one follows.
	    while (nonet >= 0x100) {
	        nonet = **sequence;

	        if (nonet > 0x1FF) {
	            // Not a 9-bit byte
	            return (uint_least32_t)-1;
	        }
	        if (value >= 0x11 && nonet >= 0x100) {
	            // Outside the [0x00, 0x10FFFF] range
	            return (uint_least32_t)-1;
	        }

	        value = (value << 8) | (nonet & 0xFF);
	        // Consume the nonet only after it has been accepted.
	        (*sequence)++;
	    }

	    return value;
	}

	/**
	 * Feeds one nonet to the incremental UTF-9 decoder.
	 *
	 * @param ps    Decoder state; ps->value holds the data bits collected so
	 *              far and is 0 between sequences.
	 * @param nonet The next nonet of the stream.
	 * @return The decoded code point when this nonet completes a sequence;
	 *         (uint_least32_t)-2 when more nonets are needed;
	 *         (uint_least32_t)-1 when the nonet is invalid at this position.
	 *         After an invalid nonet the state is 0, so the next nonet starts
	 *         a new sequence.
	 */
	uint_least32_t utf9_decoder_add_nonet(
	                  struct utf9_decoder_state ps[static 1],
	                  unsigned int nonet)
	{
	    if (nonet > 0x1FF) {
	        // Not a 9-bit byte. (This condition would be a no-op on a true
	        // 9-bit processor.)
	        goto invalid_nonet;
	    }

	    if (ps->value == 0) {
	        // Start of a sequence; the state is already 0, so these early
	        // returns need no reset.
	        if (nonet == 0x100) {
	            // The sequence would become overlong with this nonet.
	            // Subsequent nonets may form a valid sequence if this
	            // nonet is skipped.
	            return (uint_least32_t)-1;
	        }
	        if (nonet >= 0x1D8 && nonet <= 0x1DF) {
	            // Either a UTF-16 surrogate (U+D800 to U+DFFF) code unit,
	            // or a value that would be outside the Unicode range.
	            return (uint_least32_t)-1;
	        }
	    }

	    assert(ps->value <= 0x10FF);

	    if (ps->value >= 0x11 && nonet >= 0x100) {
	        // Outside the [0x00, 0x10FFFF] range
	        goto invalid_nonet;
	    }

	    uint_least32_t new_value = ((uint_least32_t)ps->value << 8) |
	        (nonet & 0xFF);
	    if (nonet < 0x100) {
	        // Final nonet: the sequence is complete.
	        ps->value = 0;
	        return new_value;
	    }

	    // Continuation bit set: keep the partial value and ask for more.
	    ps->value = (unsigned int)new_value;
	    return (uint_least32_t)-2;

	invalid_nonet:
	    ps->value = 0;
	    return (uint_least32_t)-1;
	}

	/**
	 * Encodes one code point as UTF-9.
	 *
	 * The code point is split into 8-bit groups, most significant first. Every
	 * nonet except the last has bit 0x100 set to signal that another follows.
	 *
	 * @param buffer     Destination for the nonets; written only when the
	 *                   whole encoding fits.
	 * @param max_length Capacity of buffer, in nonets.
	 * @param code_point Code point to encode. The asserts require it to be at
	 *                   most 0x10FFFF and not a surrogate (U+D800..U+DFFF).
	 * @return The number of nonets the encoding needs. A value greater than
	 *         max_length means nothing was written.
	 */
	size_t utf32_to_utf9(UINT9 buffer[/* max_length */], size_t max_length,
	                     uint_least32_t code_point)
	{
	    assert(code_point <= 0x10FFFF);
	    assert(code_point < 0xD800 || code_point > 0xDFFF);
	    size_t length = 0;
	    uint_least32_t upper_bits = code_point;
	    // One nonet per 8 significant bits, and at least one (for U+0000).
	    do {
	        length++;
	    } while ((upper_bits >>= 8) > 0);

	    if (length <= max_length) {
	        // Fill from the last nonet back to the first.
	        size_t i = length - 1;
	        do {
	            buffer[i] = code_point & 0xFF;
	            if (i < length - 1) {
	                // Every nonet but the final one carries the
	                // continuation bit.
	                buffer[i] |= 0x100;
	            }
	            code_point >>= 8;
	        } while (--i != (size_t)-1); // stops once i wraps below 0
	    }
	    return length;
	}