Skip to content

Instantly share code, notes, and snippets.

@tommai78101
Last active June 4, 2024 20:41
Show Gist options
  • Save tommai78101/3631ed1f136b78238e85582f08bdc618 to your computer and use it in GitHub Desktop.
Save tommai78101/3631ed1f136b78238e85582f08bdc618 to your computer and use it in GitHub Desktop.
UTF-8 to UTF-16 one-way conversion, written in C
#include <stdio.h>
#include <stdlib.h>
#include <uchar.h>
#include <locale.h>
#define __STD_UTF_16__
//Decodes one UTF-8 sequence that starts at bytes[0], reading at most "available" bytes (available must be >= 1).
//Returns the number of input bytes the caller should skip, which is always at least 1.
//On success *code_point receives the Unicode scalar value. It receives -1 when the bytes are not a
//well-formed sequence: stray continuation byte, lead byte 0xF8..0xFF, truncated sequence, bad continuation
//byte, overlong form, encoded surrogate, or a value above U+10FFFF.
static int utf8_decode_sequence(const unsigned char *bytes, int available, long *code_point) {
    const unsigned char lead = bytes[0];
    int length;
    long value;
    long minimum;

    *code_point = -1;
    if (lead < 0x80) {
        //0x00..0x7F: ASCII, the byte is the code point.
        *code_point = lead;
        return 1;
    }
    if (lead < 0xC0) {
        //0x80..0xBF: a continuation byte with no lead byte in front of it.
        return 1;
    }
    if (lead < 0xE0) {
        //110aaaaa 10bbbbbb
        length = 2;
        value = lead & 0x1F;
        minimum = 0x80;
    }
    else if (lead < 0xF0) {
        //1110aaaa 10bbbbbb 10cccccc
        length = 3;
        value = lead & 0x0F;
        minimum = 0x800;
    }
    else if (lead < 0xF8) {
        //11110aaa 10bbbbbb 10cccccc 10dddddd
        length = 4;
        value = lead & 0x07;
        minimum = 0x10000;
    }
    else {
        //0xF8..0xFF never appear in UTF-8.
        return 1;
    }

    if (length > available) {
        //Truncated at the end of the buffer. Skip only the lead byte; the leftovers are handled as strays.
        return 1;
    }
    for (int i = 1; i < length; i++) {
        //Every trailing byte must look like 10xxxxxx. This also stops at a NUL terminator inside a sequence.
        if ((bytes[i] & 0xC0) != 0x80) {
            return 1;
        }
        //Each continuation byte carries 6 payload bits, hence the shift by 6 (not 8).
        value = (value << 6) | (bytes[i] & 0x3F);
    }

    //Reject overlong forms, values beyond the Unicode range, and surrogates (which are not scalar values).
    if (value < minimum || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
        return length;
    }
    *code_point = value;
    return length;
}

//Converts a UTF-8 byte string into UTF-16 code units.
//
//utf8_str               Input bytes. Conversion stops at the first NUL byte or after utf8_str_size bytes,
//                       whichever comes first. A null pointer is treated as empty input.
//utf8_str_size          Number of readable bytes in utf8_str. Pointers carry no length, so it must be passed in.
//utf16_str_output       Destination buffer. Nothing is done if this is a null pointer.
//utf16_str_output_size  Capacity of the destination, counted in char16_t code units.
//
//At most utf16_str_output_size code units are written. Every code unit that is not used by the converted
//text is set to 0, so the result is NUL-terminated whenever the converted text is shorter than the buffer.
//If the converted text fills the buffer exactly, there is no terminator.
//Malformed input bytes are skipped without producing output. A code point above U+FFFF is written as a
//high surrogate followed by a low surrogate; if only one code unit of space is left, conversion stops
//rather than emitting half a pair.
void utf8_to_utf16(unsigned char* const utf8_str, int utf8_str_size, char16_t* utf16_str_output, int utf16_str_output_size) {
    int utf8_index = 0;
    int utf16_index = 0;

    if (utf16_str_output == NULL) {
        return;
    }
    if (utf8_str == NULL) {
        utf8_str_size = 0;
    }

    //Both bounds have to hold at the same time: there must be input left to read AND room left to write.
    //The input bound is tested before the byte is dereferenced.
    while (utf8_index < utf8_str_size && utf16_index < utf16_str_output_size && utf8_str[utf8_index] != 0) {
        long code_point;
        utf8_index += utf8_decode_sequence(utf8_str + utf8_index, utf8_str_size - utf8_index, &code_point);
        if (code_point < 0) {
            //Malformed input, nothing to emit.
            continue;
        }
        if (code_point < 0x10000) {
            //Basic Multilingual Plane: one code unit holds the value directly.
            utf16_str_output[utf16_index++] = (char16_t) code_point;
        }
        else {
            //Supplementary planes need two code units. Never write only half of the pair.
            if (utf16_str_output_size - utf16_index < 2) {
                break;
            }
            code_point -= 0x10000;
            //High (lead) surrogate first, then the low (trail) surrogate.
            utf16_str_output[utf16_index++] = (char16_t) (0xD800 + (code_point >> 10));
            utf16_str_output[utf16_index++] = (char16_t) (0xDC00 + (code_point & 0x3FF));
        }
    }

    //Zero the unused tail of the output buffer.
    while (utf16_index < utf16_str_output_size) {
        utf16_str_output[utf16_index++] = 0;
    }
}
int main() {
    //Sample input: a UTF-8 encoded literal. The array also holds the terminating NUL byte.
    unsigned char source[] = u8"我是誰?";
    //Destination buffer with room for 25 UTF-16 code units.
    char16_t converted[25];
    //Element counts are taken from the arrays here, because the callee only receives pointers.
    int source_count = sizeof(source) / sizeof(source[0]);
    int converted_count = sizeof(converted) / sizeof(converted[0]);
    utf8_to_utf16(source, source_count, converted, converted_count);
    //To inspect the result, set a breakpoint on the return statement and look at "converted".
    return 0;
}
@tommai78101
Copy link
Author

tommai78101 commented Mar 17, 2021

@tbeu Could you explain why it's 6, and not 8? I thought the low and high shorts are 8 bits long?

@k6l2
Copy link

k6l2 commented Jun 22, 2022

@tbeu Could you explain why it's 6, and not 8? I thought the low and high shorts are 8 bits long?

It's because 0x3F is 6 bits of information from the low short. By shifting 8 bits, you are adding 2 extra zeroes in between the actual data of both shorts.

@tommai78101
Copy link
Author

Thank you. I would never have been able to see the problem without that explanation.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment