Skip to content

Instantly share code, notes, and snippets.

@Explorer09
Created March 30, 2025 19:02
Show Gist options
  • Save Explorer09/e3795a528d8c8b5d43dae054640c3733 to your computer and use it in GitHub Desktop.
Save Explorer09/e3795a528d8c8b5d43dae054640c3733 to your computer and use it in GitHub Desktop.
UTF-9 (RFC 4042) conversion functions
/** @file utf9.c
UTF-9 (RFC 4042) conversion functions.
These functions implement UTF-9, a transformation format in an April
Fools' RFC document, RFC 4042.
Copyright 2025 Kang-Che Sung <explorer09 @ gmail.com>
MIT License (MIT/Expat)
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* SPDX-License-Identifier: MIT */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
/* UINT9 is the integer type used throughout this file to hold one nonet
   (a 9-bit byte). When <stdint.h> advertises a native 9-bit type through
   UINT9_MAX or UINT9_WIDTH, uint9_t is used; otherwise the code falls
   back to uint_least16_t, which is wide enough to hold 0x000..0x1FF. */
#if defined(UINT9_MAX) || defined(UINT9_WIDTH)
#define UINT9 uint9_t
#else
#define UINT9 uint_least16_t
#endif
/** State carried between successive calls to utf9_decoder_add_nonet().
A state whose `value` is 0 means that no sequence is in progress, so a
zero-initialized object is a valid starting state. */
struct utf9_decoder_state {
// Data octets accumulated so far from an unfinished sequence (most
// significant octet first). utf9_decoder_add_nonet() resets this to 0
// whenever it returns a code point or rejects a nonet.
unsigned int value;
};
/** Two zero nonets. A zero nonet has its continuation bit (0x100) clear,
so utf9_to_code_point() never reads beyond it within one call.
NOTE(review): presumably there are two because a truncated sequence at
the end of the input makes utf9_to_code_point() consume the first zero
as the final nonet of that sequence, leaving the second one to be
decoded as U+0000 by the next call -- confirm the intended usage. */
const UINT9 utf9_safe_terminator[] = {0x000, 0x000};
/** Decodes one UTF-9 sequence and advances the read cursor.

The function takes no length argument. A call examines at most three
nonets and stops at the first nonet whose continuation bit (0x100) is
clear, so the caller must guarantee that such a nonet is present before
the end of the buffer.

Following the WHATWG recommendation not to "mask" bytes after an illegal
combination
(<https://encoding.spec.whatwg.org/#security-background>), a nonet that
would make the sequence invalid is not consumed; the next call starts
decoding at that nonet. The leading nonet is always consumed.

@param sequence Address of the read cursor. On return the cursor points
                just past the last nonet that was consumed.
@return The decoded code point, or `(uint_least32_t)-1` if the sequence
        is invalid. */
uint_least32_t utf9_to_code_point(const UINT9 *sequence[static 1])
{
    const uint_least32_t invalid = (uint_least32_t)-1;

    // The leading nonet is consumed unconditionally, even when it is
    // rejected below.
    const UINT9 *cursor = *sequence;
    unsigned int current = *cursor++;
    *sequence = cursor;

    // Rejected leading nonets:
    //  - above 0x1FF: not a 9-bit byte (cannot happen on a true 9-bit
    //    processor);
    //  - 0x100: a leading zero octet, i.e. an overlong sequence (the
    //    nonets after it may still form a valid sequence);
    //  - 0x1D8..0x1DF: either a UTF-16 surrogate (U+D800 to U+DFFF) or a
    //    value beyond the Unicode range.
    if (current > 0x1FF || current == 0x100 ||
        (current >= 0x1D8 && current <= 0x1DF)) {
        return invalid;
    }

    uint_least32_t code_point = current & 0xFF;
    int more_follows = current >= 0x100;

    while (more_follows) {
        // Peek only; the cursor moves once the nonet has been accepted.
        unsigned int next = *cursor;
        if (next > 0x1FF) {
            // Not a 9-bit byte
            return invalid;
        }

        more_follows = next >= 0x100;
        if (more_follows && code_point >= 0x11) {
            // Another octet after this one would push the result
            // outside the [0x00, 0x10FFFF] range.
            return invalid;
        }

        code_point = (code_point << 8) | (next & 0xFF);
        *sequence = ++cursor;
    }

    return code_point;
}
/** Feeds one nonet to the incremental UTF-9 decoder.

@param ps    Decoder state. `ps->value == 0` means that no sequence is
             in progress. The state is reset to 0 whenever a code point
             is returned or a nonet is rejected.
@param nonet The next input nonet.
@return The decoded code point when `nonet` completes a sequence;
        `(uint_least32_t)-2` when more nonets are needed;
        `(uint_least32_t)-1` when `nonet` is invalid at this position. */
uint_least32_t utf9_decoder_add_nonet(
    struct utf9_decoder_state ps[static 1],
    unsigned int nonet)
{
    const uint_least32_t result_invalid = (uint_least32_t)-1;
    const uint_least32_t result_incomplete = (uint_least32_t)-2;
    const unsigned int pending = ps->value;

    if (nonet > 0x1FF) {
        // Not a 9-bit byte. (This condition would be a no-op on a true
        // 9-bit processor.)
        ps->value = 0;
        return result_invalid;
    }

    if (pending == 0) {
        // Start of a sequence; the state is already 0, so these
        // rejections do not need to reset it.
        //  - 0x100 is a leading zero octet, i.e. an overlong sequence.
        //    The nonets after it may still form a valid sequence.
        //  - 0x1D8..0x1DF is either a UTF-16 surrogate (U+D800 to
        //    U+DFFF) or a value beyond the Unicode range.
        if (nonet == 0x100 || (nonet >= 0x1D8 && nonet <= 0x1DF)) {
            return result_invalid;
        }
    }

    // At most two octets can be pending, and the first of them is at
    // most 0x10.
    assert(pending <= 0x10FF);

    const int more_follows = nonet >= 0x100;
    if (more_follows && pending >= 0x11) {
        // Outside the [0x00, 0x10FFFF] range
        ps->value = 0;
        return result_invalid;
    }

    const uint_least32_t combined =
        ((uint_least32_t)pending << 8) | (nonet & 0xFF);

    if (more_follows) {
        ps->value = (unsigned int)combined;
        return result_incomplete;
    }

    ps->value = 0;
    return combined;
}
/** Encodes one code point as a UTF-9 sequence.

Every nonet except the last carries the continuation bit (0x100); the
low 8 bits of each nonet hold one octet of the code point, most
significant octet first.

@param buffer     Destination for the nonets. It is written only when
                  the whole sequence fits, so it is not accessed when
                  `max_length` is 0.
@param max_length Number of nonets `buffer` can hold.
@param code_point The value to encode. It must be at most 0x10FFFF and
                  must not be a surrogate (checked with assert()).
@return The number of nonets the sequence needs. A return value greater
        than `max_length` means nothing was written. */
size_t utf32_to_utf9(UINT9 buffer[/* max_length */], size_t max_length,
                     uint_least32_t code_point)
{
    assert(code_point <= 0x10FFFF);
    assert(code_point < 0xD800 || code_point > 0xDFFF);

    // One nonet per octet, ignoring leading zero octets; U+0000 still
    // takes one nonet.
    size_t length = 1;
    for (uint_least32_t rest = code_point >> 8; rest != 0; rest >>= 8) {
        length++;
    }

    if (length > max_length) {
        return length;
    }

    // Fill from the last position backwards. The first nonet written is
    // the final one of the sequence and is the only one without the
    // continuation bit.
    unsigned int continuation = 0;
    for (size_t pos = length; pos-- > 0;) {
        buffer[pos] = (code_point & 0xFF) | continuation;
        continuation = 0x100;
        code_point >>= 8;
    }

    return length;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment