Skip to content

Instantly share code, notes, and snippets.

@tommai78101
Last active February 20, 2025 11:42
Show Gist options
  • Save tommai78101/3631ed1f136b78238e85582f08bdc618 to your computer and use it in GitHub Desktop.
Save tommai78101/3631ed1f136b78238e85582f08bdc618 to your computer and use it in GitHub Desktop.
UTF-8 to UTF-16 one-way conversion, written in C
#include <stdio.h>
#include <stdlib.h>
#include <uchar.h>
#include <locale.h>
#define __STD_UTF_16__
//Converts a UTF-8 byte string into UTF-16 code units.
//
//Pointer arrays must always include the array size, because pointers do not know about the size of the array they point to.
//
//Parameters:
//  utf8_str               - input bytes. Conversion stops at the first NUL byte or after utf8_str_size bytes,
//                           whichever comes first. May be NULL, in which case it is treated as empty.
//  utf8_str_size          - number of readable bytes in utf8_str.
//  utf16_str_output       - destination buffer. If NULL, the function does nothing.
//  utf16_str_output_size  - capacity of the destination in char16_t units. If <= 0, the function does nothing.
//
//Behavior:
//  - The output is always NUL-terminated; the last slot is reserved for the terminator and every unused slot
//    is zero-filled. Input that does not fit is truncated.
//  - A supplementary-plane character is written as a surrogate pair (high surrogate first). If both halves do
//    not fit, conversion stops instead of emitting a lone surrogate.
//  - Malformed input is skipped, never copied: stray continuation bytes, invalid lead bytes (0xF8..0xFF),
//    truncated sequences, overlong encodings, encoded surrogates (U+D800..U+DFFF) and values above U+10FFFF.
//    There is no error reporting; the function returns void.
void utf8_to_utf16(unsigned char* const utf8_str, int utf8_str_size, char16_t* utf16_str_output, int utf16_str_output_size) {
    if (utf16_str_output == NULL || utf16_str_output_size <= 0) {
        return;
    }
    if (utf8_str == NULL || utf8_str_size < 0) {
        utf8_str_size = 0;
    }

    //Keep one slot free so the result can always be terminated.
    const int out_limit = utf16_str_output_size - 1;
    int in_pos = 0;
    int out_pos = 0;

    //Both bounds must hold at the same time, and the input bound is checked before the byte is read.
    while (in_pos < utf8_str_size && utf8_str[in_pos] != 0 && out_pos < out_limit) {
        const unsigned char lead = utf8_str[in_pos];

        if (lead < 0x80) {
            //0..127, the ASCII range, maps directly to one UTF-16 code unit.
            utf16_str_output[out_pos++] = (char16_t) lead;
            in_pos++;
            continue;
        }

        int seq_len;
        unsigned long code_point;
        unsigned long min_value;
        if (lead < 0xC0) {
            //0x80..0xBF is a continuation byte with no lead byte in front of it; ignore it.
            in_pos++;
            continue;
        }
        else if (lead < 0xE0) {
            //110xxxxx 10xxxxxx -> U+0080..U+07FF
            seq_len = 2;
            code_point = lead & 0x1Fu;
            min_value = 0x80;
        }
        else if (lead < 0xF0) {
            //1110xxxx 10xxxxxx 10xxxxxx -> U+0800..U+FFFF
            seq_len = 3;
            code_point = lead & 0x0Fu;
            min_value = 0x800;
        }
        else if (lead < 0xF8) {
            //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+10000..U+10FFFF
            seq_len = 4;
            code_point = lead & 0x07u;
            min_value = 0x10000;
        }
        else {
            //0xF8..0xFF never appears in UTF-8; ignore it.
            in_pos++;
            continue;
        }

        //Each continuation byte carries 6 payload bits. Reading stops at the first byte that is not a
        //continuation byte (this includes the NUL terminator), so we never step past the end of the string.
        int valid = (utf8_str_size - in_pos >= seq_len);
        for (int i = 1; valid && i < seq_len; i++) {
            const unsigned char next = utf8_str[in_pos + i];
            if ((next & 0xC0) != 0x80) {
                valid = 0;
            }
            else {
                code_point = (code_point << 6) | (next & 0x3Fu);
            }
        }

        //Reject truncated sequences, overlong forms, encoded surrogates and out-of-range values.
        if (!valid || code_point < min_value || code_point > 0x10FFFF
            || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
            //Drop only the lead byte; any continuation bytes that follow are dropped as strays.
            in_pos++;
            continue;
        }

        if (code_point < 0x10000) {
            //Basic Multilingual Plane: one code unit.
            utf16_str_output[out_pos++] = (char16_t) code_point;
        }
        else {
            //Supplementary planes need two code units; never write half a pair.
            if (out_limit - out_pos < 2) {
                break;
            }
            code_point -= 0x10000;
            utf16_str_output[out_pos++] = (char16_t) (0xD800 + (code_point >> 10));
            utf16_str_output[out_pos++] = (char16_t) (0xDC00 + (code_point & 0x3FF));
        }
        in_pos += seq_len;
    }

    //Terminate the string and clear whatever is left of the output buffer.
    while (out_pos < utf16_str_output_size) {
        utf16_str_output[out_pos++] = 0;
    }
}
//Demo entry point: runs the converter on a short UTF-8 sample so the result can be inspected in a debugger.
int main() {
    //The sample's array length includes the string literal's terminating NUL.
    unsigned char sample[] = u8"我是誰?";
    char16_t converted[25];
    const int sample_length = sizeof(sample) / sizeof(sample[0]);
    const int converted_length = sizeof(converted) / sizeof(converted[0]);
    utf8_to_utf16(sample, sample_length, converted, converted_length);
    //Set a debug breakpoint on the return statement, and inspect the "converted" variable.
    return 0;
}
@tommai78101
Copy link
Author

MIT Licensed.

@tbeu
Copy link

tbeu commented Mar 14, 2021

In line 57 it should read int unicode = (highShort << 6) | lowShort;, i.e. only shifted by 6 bits.

@tommai78101
Copy link
Author

tommai78101 commented Mar 17, 2021

@tbeu Could you explain why it's 6, and not 8? I thought the low and high shorts are 8 bits long?

@k6l2
Copy link

k6l2 commented Jun 22, 2022

@tbeu Could you explain why it's 6, and not 8? I thought the low and high shorts are 8 bits long?

It's because 0x3F is 6 bits of information from the low short. By shifting 8 bits, you are adding 2 extra zeroes in between the actual data of both shorts.

@tommai78101
Copy link
Author

Thank you. I would never be able to see the problem without that explanation.

@matu3ba
Copy link

matu3ba commented Feb 20, 2025

  1. These both look superfluous and make the code unnecessarily C11 instead of the more common C99:
#include <uchar.h>
#include <locale.h>

You can use instead int16_t. __STD_UTF_16__ then also becomes unnecessary.
2. Likewise you can simply use uint8_t instead of unsigned char.
3. Also bear in mind that plain pointer casting may require disabling strict aliasing, or using a global or local buffer to memcpy the memory section out.
Future compilers may silently miscompile your code if you don't provide aliasing hints.
4. Normal users and compilers since something like gcc 4 use UTF-8 encoded files as input, and sane Windows users pass /utf-8 as an additional flag, so there is no need for the non-C99-compatible u8"我是誰?";
5. Error condition documentation is missing; more fundamentally, there is no error checking whatsoever.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment