-
-
Save tylerneylon/9773800 to your computer and use it in GitHub Desktop.
// This macro tests if a char is a continuation byte in utf8, i.e. whether its
// top two bits are "10".
#define IS_CONT(x) (((x) & 0xc0) == 0x80)

// Decodes the code point encoded at **s and advances *s to point to the next
// character, so it can easily be used in a loop.
//
//   s: in/out cursor into a utf8 byte sequence. *s is always advanced by at
//      least one byte - even when **s is the NUL terminator - so the caller
//      must check for the end of the string before calling.
//
// Returns the decoded value. No validation is performed: malformed input
// yields an unspecified value rather than an error. Decoding stops at the
// first byte that is not a continuation byte, so a truncated sequence never
// makes the cursor run past a NUL terminator.
int decode_code_point(char **s) {
  // Work on the lead byte as unsigned: shifting or sign-extending a negative
  // `char` (any byte >= 0x80 where char is signed) is not well defined.
  const unsigned char lead = (unsigned char)**s;

  // k = number of leading 1 bits in the lead byte.
  // k = 0 for one-byte code points; otherwise, k = #total bytes.
  int k = 0;
  for (unsigned bit = 0x80u; lead & bit; bit >>= 1) {
    ++k;
  }

  // Keep only the payload bits of the lead byte (everything below the k
  // leading 1s). Accumulate in unsigned so that over-long garbage sequences
  // (lead bytes 0xFE / 0xFF) wrap instead of overflowing a signed int.
  unsigned value = lead & (0xFFu >> k);

  // Each continuation byte contributes its low 6 bits.
  for (++(*s), --k; k > 0 && IS_CONT(**s); --k, ++(*s)) {
    value <<= 6;
    value += (unsigned)(**s & 0x3F);
  }
  return (int)value;
}
// Encodes `code` as utf8 at *s and advances *s past the bytes written, so it
// is easy to use in a loop.
//
//   s:    in/out write cursor.
//   end:  one past the last writable byte; nothing is ever written at or
//         beyond `end`.
//   code: the code point to encode, expected to be in 0..0x10FFFF. Values
//         outside that range are encoded as U+FFFD (replacement character)
//         instead of overrunning the internal 4-byte scratch buffer.
//
// The write is all-or-nothing: if the whole sequence does not fit before
// `end`, nothing is written and *s is left unchanged, so the output never
// ends in a truncated multi-byte sequence. Since a successful call always
// advances *s, an unchanged *s tells the caller the buffer is full.
void encode_code_point(char **s, char *end, int code) {
  if (code < 0 || code > 0x10FFFF) {
    code = 0xFFFD;
  }

  // Bytes are produced last-to-first: continuation bytes come from the low
  // bits, and the lead byte is only known once the remaining bits fit in it.
  unsigned char val[4];
  unsigned rest = (unsigned)code;
  unsigned lead_byte_max = 0x7F;  // Largest payload the lead byte can hold.
  int val_index = 0;
  while (rest > lead_byte_max) {
    val[val_index++] = (unsigned char)((rest & 0x3F) | 0x80);
    rest >>= 6;
    // Lead payload capacity goes 7 bits -> 5 -> 4 -> 3 as the length grows.
    lead_byte_max >>= (val_index == 1 ? 2 : 1);
  }
  // Lead byte = payload bits plus the length marker (the 1 bits above the
  // payload and its 0 separator). Unsigned math avoids shifting a negative.
  val[val_index++] =
      (unsigned char)((rest & lead_byte_max) | ((~lead_byte_max << 1) & 0xFFu));

  // Refuse to emit a partial sequence.
  if (end - *s < val_index) {
    return;
  }
  while (val_index--) {
    **s = (char)val[val_index];
    (*s)++;
  }
}
// Splits a code point above the 16-bit range into a utf16 surrogate pair.
//
//   code:  the code point to examine.
//   surr1: receives the first (high) surrogate when a split happens.
//   surr2: receives the second (low) surrogate when a split happens.
//
// Returns 1 if `code` was split, or 0 if no split was needed (code <= 0xFFFF),
// in which case *surr1 and *surr2 are left untouched.
int split_into_surrogates(int code, int *surr1, int *surr2) {
  if (code > 0xFFFF) {
    // The low 10 bits go into the second surrogate.
    const int low_ten = code & 0x3FF;
    // What remains has low bits "uuu uuxx xxxx"; the bits of *surr1 are
    // "1101 10ww wwxx xxxx" where wwww = (uuuuu - 1). Subtracting 0x40 is
    // what turns uuuuu into uuuuu - 1.
    const int upper = (code >> 10) & 0x7FF;

    *surr2 = 0xDC00 | low_ten;
    *surr1 = 0xD800 | (upper - 0x40);
    return 1;
  }
  return 0;
}
// Reassembles utf16 surrogate pairs into full code points. This expects to be
// used in a loop and to see every 16-bit unit, in order, in *code.
//
//   old:  state carried between calls. Start *old at 0; this function updates
//         it for you - don't change it after initialization.
//   code: in: the next unit; out: the final code point when 1 is returned.
//
// Returns 0 when *code is the 1st (high) half of a surrogate pair, meaning
// there is nothing to emit yet; otherwise returns 1 and *code holds the final
// code point.
//
// Only a high surrogate (0xD800-0xDBFF) followed by a low surrogate
// (0xDC00-0xDFFF) is joined. An unpaired high surrogate is dropped, and any
// other value - including an unpaired low surrogate - is passed through
// unchanged.
int join_from_surrogates(int *old, int *code) {
  // Exact range checks: a loose bit-mask test such as (x & 0xD800) == 0xD800
  // would also match low surrogates, U+F800..U+FFFF (e.g. U+FFFD, U+FEFF) and
  // some already-joined code points such as U+1F980.
  const int is_high = (*code >= 0xD800 && *code <= 0xDBFF);
  const int is_low = (*code >= 0xDC00 && *code <= 0xDFFF);

  if (*old && is_low) {
    // Undo the split: the high half stores (uuuuu - 1), hence the + 0x40.
    *code = (((*old & 0x3FF) + 0x40) << 10) + (*code & 0x3FF);
    *old = 0;
    return 1;
  }

  // Remember a high surrogate for the next call; anything else clears state.
  *old = is_high ? *code : 0;
  return !(*old);
}
@tylerneylon
The interesting thing is that neither ChatGPT nor Google Gemini, both of which I tried, would generate code as compact as mine.
My version compiles to 144 bytes with x86-64 GCC.
ChatGPT
https://chatgpt.com/share/0c457058-2e4a-44bb-93cd-cbd06334039a
Google Gemini
https://g.co/gemini/share/d3c4ed05bbcf
An update for people who visit this page via a web search:
My versions of the UTF-8 decoder and encoder are here:
https://gitlab.com/-/snippets/3718423
Note that I provided three versions of the decoder depending on how much error handling you need.
- `utf8_to_code_point` - For decoding a string of UTF-8 sequences that you know will all be valid, or that does not need to display more than one U+FFFD for any invalid sequence encountered.
- `utf8_to_code_point_la1` - For UTF-8 decoding that's fully compatible with the WHATWG Encoding Standard, in particular the "Maximal Subparts" approach of converting invalid sequences to a series of U+FFFDs. (You may also read this for a coding challenge (code golf) of writing a UTF-8 decoder.)
- `utf8_decoder_add_byte` - For a decoder where you manage a `utf8_decoder_state` object yourself. The `utf8_mbrtowc` and `utf8_mbsnrtowcs` functions are C-like interfaces that utilize `utf8_decoder_state` and that you can use right away.
The encoder function is utf32_to_utf8. There is no checking on whether the code point is valid before encoding, as I think that checking is best done by the caller code. The function has a bound check on the output buffer, though.
@Explorer09 thanks for the snapshot of your code and the notes. Of course many people will appreciate error-checking code! 👍