Created
March 30, 2025 19:02
-
-
Save Explorer09/e3795a528d8c8b5d43dae054640c3733 to your computer and use it in GitHub Desktop.
UTF-9 (RFC 4042) conversion functions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** @file utf9.c
UTF-9 (RFC 4042) conversion functions.
These functions implement UTF-9, a transformation format in an April
Fools' RFC document, RFC 4042.
Copyright 2025 Kang-Che Sung <explorer09 @ gmail.com>
MIT License (MIT/Expat)
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* SPDX-License-Identifier: MIT */
#include <assert.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
/*
 * UINT9 is the storage type for one nonet (9-bit unit).
 * If the implementation advertises a native 9-bit type through UINT9_MAX or
 * UINT9_WIDTH, use it; otherwise fall back to uint_least16_t, which is the
 * smallest standard type guaranteed to hold values up to 0x1FF.
 */
#if defined(UINT9_MAX) || defined(UINT9_WIDTH)
#define UINT9 uint9_t
#else
#define UINT9 uint_least16_t
#endif
/**
 * State carried between calls to utf9_decoder_add_nonet().
 */
struct utf9_decoder_state {
    /* Octets accumulated so far for the sequence in progress.
     * 0 means no sequence is in progress; a zero-initialized object is
     * therefore a valid initial state. The decoder never stores a value
     * above 0x10FF here. */
    unsigned int value;
};
/**
 * Two zero nonets.
 *
 * NOTE(review): presumably meant to be appended to a buffer handed to
 * utf9_to_code_point(), which has no length parameter. A truncated sequence
 * would absorb the first zero as its final nonet, leaving the second zero to
 * be decoded as the end marker. Confirm against the callers.
 */
const UINT9 utf9_safe_terminator[] = {0x000, 0x000};
/**
 * Decodes one code point from a UTF-9 nonet sequence.
 *
 * The first nonet is always consumed. Every later nonet is only peeked at
 * first and is consumed solely when it is accepted into the sequence, so a
 * nonet that would make the sequence invalid is left for the next call.
 *
 * Reading continues until a nonet below 0x100 ends the sequence or an
 * invalid nonet is found. The function has no length parameter and does no
 * bounds checking of its own, so the input must guarantee one of those two
 * conditions occurs inside the buffer.
 *
 * @param sequence Pointer to the read cursor. On return the cursor has been
 *                 advanced past every nonet that was consumed.
 * @return The decoded code point, or (uint_least32_t)-1 if the sequence is
 *         invalid.
 */
uint_least32_t utf9_to_code_point(const UINT9 *sequence[static 1])
{
    // We follow the recommendation from WHATWG to "not mask" any
    // subsequent byte in case of an illegal byte combination.
    // That is, we stop whenever a byte that would cause the sequence
    // to become invalid is found.
    // (See <https://encoding.spec.whatwg.org/#security-background>.)
    unsigned int nonet = *(*sequence)++;

    if (nonet > 0x1FF) {
        // Not a 9-bit byte. (This condition would be a no-op on a true
        // 9-bit processor.)
        return (uint_least32_t)-1;
    }
    if (nonet == 0x100) {
        // The sequence would become overlong with this nonet.
        // Subsequent nonets may form a valid sequence if this nonet is
        // skipped.
        return (uint_least32_t)-1;
    }
    if (nonet >= 0x1D8 && nonet <= 0x1DF) {
        // Either a UTF-16 surrogate (U+D800 to U+DFFF) code unit, or a
        // value that would be outside the Unicode range.
        return (uint_least32_t)-1;
    }

    uint_least32_t value = nonet & 0xFF;

    // Bit 0x100 set on the nonet just accepted means another nonet follows.
    while (nonet >= 0x100) {
        // Peek only; the cursor moves after the nonet has been accepted.
        nonet = **sequence;
        if (nonet > 0x1FF) {
            // Not a 9-bit byte
            return (uint_least32_t)-1;
        }
        if (value >= 0x11 && nonet >= 0x100) {
            // Outside the [0x00, 0x10FFFF] range: at least two more octets
            // would be appended to a value that is already 0x11 or more.
            return (uint_least32_t)-1;
        }
        value = (value << 8) | (nonet & 0xFF);
        (*sequence)++;
    }
    return value;
}
uint_least32_t utf9_decoder_add_nonet( | |
struct utf9_decoder_state ps[static 1], | |
unsigned int nonet) | |
{ | |
if (nonet > 0x1FF) { | |
// Not a 9-bit byte. (This condition would be a no-op on a true | |
// 9-bit processor.) | |
goto invalid_nonet; | |
} | |
if (ps->value == 0) { | |
if (nonet == 0x100) { | |
// The sequence would become overlong with this nonet. | |
// Subsequent nonets may form a valid sequence if this | |
// nonet is skipped. | |
return (uint_least32_t)-1; | |
} | |
if (nonet >= 0x1D8 && nonet <= 0x1DF) { | |
// Either a UTF-16 surrogate (U+D800 to U+DFFF) code unit, | |
// or a value that would be outside the Unicode range. | |
return (uint_least32_t)-1; | |
} | |
} | |
assert(ps->value <= 0x10FF); | |
if (ps->value >= 0x11 && nonet >= 0x100) { | |
// Outside the [0x00, 0x10FFFF] range | |
goto invalid_nonet; | |
} | |
uint_least32_t new_value = ((uint_least32_t)ps->value << 8) | | |
(nonet & 0xFF); | |
if (nonet < 0x100) { | |
ps->value = 0; | |
return new_value; | |
} | |
ps->value = (unsigned int)new_value; | |
return (uint_least32_t)-2; | |
invalid_nonet: | |
ps->value = 0; | |
return (uint_least32_t)-1; | |
} | |
/**
 * Encodes one code point as UTF-9.
 *
 * The code point is split into octets, most significant first, without
 * leading zero octets. Every nonet except the last has bit 0x100 set.
 *
 * @param buffer     Destination for the nonets. It is not accessed when the
 *                   encoding does not fit in @p max_length.
 * @param max_length Number of elements available in @p buffer.
 * @param code_point Code point to encode. Must be at most 0x10FFFF and not
 *                   a surrogate (0xD800 to 0xDFFF); this is checked with
 *                   assert() only.
 * @return The number of nonets the encoding requires. If this exceeds
 *         @p max_length, nothing was written.
 */
size_t utf32_to_utf9(UINT9 buffer[/* max_length */], size_t max_length,
                     uint_least32_t code_point)
{
    assert(code_point <= 0x10FFFF);
    assert(code_point < 0xD800 || code_point > 0xDFFF);

    // Count the significant octets; U+0000 still takes one nonet.
    size_t length = 0;
    uint_least32_t upper_bits = code_point;
    do {
        length++;
    } while ((upper_bits >>= 8) > 0);

    if (length <= max_length) {
        // Fill from the last nonet backwards, peeling off the low octet of
        // the code point each time.
        for (size_t i = length; i-- > 0;) {
            unsigned int nonet = (unsigned int)(code_point & 0xFF);
            if (i < length - 1) {
                // Every nonet but the last carries the continuation bit.
                nonet |= 0x100;
            }
            buffer[i] = (UINT9)nonet;
            code_point >>= 8;
        }
    }
    return length;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment