tommai78101 · June 4, 2025 01:28 · tommai78101 · Mar 17, 2021 · k6l2 · Jun 22, 2022
diff --git a/unicode.c b/unicode.c
 #include <stdio.h>
 #include <stdlib.h>
 #include <uchar.h>
 #include <locale.h>

 #define __STD_UTF_16__

 //Returns non-zero when "byte" has the 10xxxxxx bit pattern of a UTF-8 continuation byte.
 static int utf8_is_continuation_byte(unsigned char byte) {
 	return (byte & 0xC0) == 0x80;
 }

 //Converts a UTF-8 byte string into UTF-16 code units.
 //
 //A pointer carries no information about the size of the array it points to, so both buffers are passed together
 //with their sizes.
 //
 //  utf8_str               Input bytes. Reading stops at the first zero byte or after utf8_str_size bytes,
 //                         whichever comes first.
 //  utf8_str_size          Number of readable bytes in utf8_str.
 //  utf16_str_output       Destination buffer. Nothing is done if it is NULL.
 //  utf16_str_output_size  Capacity of the destination, counted in char16_t units.
 //
 //Every output slot that is not used by the conversion is set to zero, so the result is zero-terminated whenever
 //it is shorter than the destination. If the conversion fills the destination completely, no terminator is stored.
 //
 //Malformed input (stray continuation bytes, truncated sequences, overlong encodings, encoded surrogates, values
 //above U+10FFFF, and the lead bytes 0xF8..0xFF) is skipped without producing output.
 void utf8_to_utf16(unsigned char* const utf8_str, int utf8_str_size, char16_t* utf16_str_output, int utf16_str_output_size) {
 	const unsigned char* input = utf8_str;
 	int utf8_str_iterator = 0;
 	int utf16_str_iterator = 0;

 	//Without a destination there is nothing that can be written.
 	if (utf16_str_output == NULL || utf16_str_output_size <= 0) {
 		return;
 	}

 	//Both limits have to hold at the same time. Joining them with "or" would let the loop keep writing after the
 	//output buffer is full for as long as input remains.
 	while (input != NULL
 			&& utf8_str_iterator < utf8_str_size
 			&& utf16_str_iterator < utf16_str_output_size
 			&& input[utf8_str_iterator] != 0) {
 		const unsigned char lead = input[utf8_str_iterator];
 		int sequence_length;
 		unsigned long code_point;
 		unsigned long minimum_code_point;

 		//The lead byte decides how long the sequence is and how many payload bits it carries itself.
 		if (lead < 0x80) {
 			//0..127, the ASCII range, maps directly to one UTF-16 code unit.
 			utf16_str_output[utf16_str_iterator++] = (char16_t) lead;
 			utf8_str_iterator++;
 			continue;
 		}
 		else if (lead < 0xC0) {
 			//0x80..0xBF is a continuation byte that does not follow a lead byte. Skip it.
 			utf8_str_iterator++;
 			continue;
 		}
 		else if (lead < 0xE0) {
 			//110xxxxx: two bytes, U+0080..U+07FF.
 			sequence_length = 2;
 			code_point = lead & 0x1FUL;
 			minimum_code_point = 0x80UL;
 		}
 		else if (lead < 0xF0) {
 			//1110xxxx: three bytes, U+0800..U+FFFF.
 			sequence_length = 3;
 			code_point = lead & 0x0FUL;
 			minimum_code_point = 0x800UL;
 		}
 		else if (lead < 0xF8) {
 			//11110xxx: four bytes, U+10000..U+10FFFF.
 			sequence_length = 4;
 			code_point = lead & 0x07UL;
 			minimum_code_point = 0x10000UL;
 		}
 		else {
 			//0xF8..0xFF never appears in UTF-8. Skip it.
 			utf8_str_iterator++;
 			continue;
 		}

 		//Each continuation byte contributes six payload bits. Every byte is checked against the input size and the
 		//10xxxxxx pattern before it is used, so a truncated sequence can never make us read past the terminator.
 		int bytes_decoded = 1;
 		while (bytes_decoded < sequence_length) {
 			if (bytes_decoded >= utf8_str_size - utf8_str_iterator) {
 				break;
 			}
 			const unsigned char next = input[utf8_str_iterator + bytes_decoded];
 			if (!utf8_is_continuation_byte(next)) {
 				break;
 			}
 			code_point = (code_point << 6) | (unsigned long) (next & 0x3F);
 			bytes_decoded++;
 		}

 		if (bytes_decoded < sequence_length) {
 			//Truncated sequence: drop only the lead byte so the byte that broke the sequence is examined again.
 			utf8_str_iterator++;
 			continue;
 		}
 		utf8_str_iterator += sequence_length;

 		//Overlong encodings, encoded surrogates and values beyond U+10FFFF are not valid and produce no output.
 		if (code_point < minimum_code_point
 				|| code_point > 0x10FFFFUL
 				|| (code_point >= 0xD800UL && code_point <= 0xDFFFUL)) {
 			continue;
 		}

 		if (code_point < 0x10000UL) {
 			//Basic Multilingual Plane: a single code unit.
 			utf16_str_output[utf16_str_iterator++] = (char16_t) code_point;
 		}
 		else {
 			//Supplementary planes need a surrogate pair. Half a pair is useless, so stop if both do not fit.
 			if (utf16_str_output_size - utf16_str_iterator < 2) {
 				break;
 			}

 			//The high (lead) surrogate must come first, followed by the low (trail) surrogate.
 			const unsigned long offset = code_point - 0x10000UL;
 			utf16_str_output[utf16_str_iterator++] = (char16_t) (0xD800UL + (offset >> 10));
 			utf16_str_output[utf16_str_iterator++] = (char16_t) (0xDC00UL + (offset & 0x3FFUL));
 		}
 	}

 	//Clear whatever is left of the output buffer.
 	while (utf16_str_iterator < utf16_str_output_size) {
 		utf16_str_output[utf16_str_iterator++] = 0;
 	}
 }

 //Demo entry point: converts a short UTF-8 sample into a fixed-size UTF-16 buffer.
 int main() {
 	//The element count computed below includes the terminating zero byte of the literal.
 	unsigned char sample_utf8[] = u8"我是誰?";
 	char16_t converted[25];

 	const int sample_length = sizeof(sample_utf8) / sizeof(sample_utf8[0]);
 	const int converted_capacity = sizeof(converted) / sizeof(converted[0]);

 	utf8_to_utf16(sample_utf8, sample_length, converted, converted_capacity);

 	//Nothing is printed. Put a breakpoint on the return statement and inspect "converted" in the debugger.
 	return 0;
 }
	#include <stdio.h>
	#include <stdlib.h>
	#include <uchar.h>
	#include <locale.h>

	#define __STD_UTF_16__

	//Returns non-zero when "byte" has the 10xxxxxx bit pattern of a UTF-8 continuation byte.
	static int utf8_is_continuation_byte(unsigned char byte) {
		return (byte & 0xC0) == 0x80;
	}

	//Converts a UTF-8 byte string into UTF-16 code units.
	//
	//A pointer carries no information about the size of the array it points to, so both buffers are passed together
	//with their sizes.
	//
	//  utf8_str               Input bytes. Reading stops at the first zero byte or after utf8_str_size bytes,
	//                         whichever comes first.
	//  utf8_str_size          Number of readable bytes in utf8_str.
	//  utf16_str_output       Destination buffer. Nothing is done if it is NULL.
	//  utf16_str_output_size  Capacity of the destination, counted in char16_t units.
	//
	//Every output slot that is not used by the conversion is set to zero, so the result is zero-terminated whenever
	//it is shorter than the destination. If the conversion fills the destination completely, no terminator is stored.
	//
	//Malformed input (stray continuation bytes, truncated sequences, overlong encodings, encoded surrogates, values
	//above U+10FFFF, and the lead bytes 0xF8..0xFF) is skipped without producing output.
	void utf8_to_utf16(unsigned char* const utf8_str, int utf8_str_size, char16_t* utf16_str_output, int utf16_str_output_size) {
		const unsigned char* input = utf8_str;
		int utf8_str_iterator = 0;
		int utf16_str_iterator = 0;

		//Without a destination there is nothing that can be written.
		if (utf16_str_output == NULL || utf16_str_output_size <= 0) {
			return;
		}

		//Both limits have to hold at the same time. Joining them with "or" would let the loop keep writing after the
		//output buffer is full for as long as input remains.
		while (input != NULL
				&& utf8_str_iterator < utf8_str_size
				&& utf16_str_iterator < utf16_str_output_size
				&& input[utf8_str_iterator] != 0) {
			const unsigned char lead = input[utf8_str_iterator];
			int sequence_length;
			unsigned long code_point;
			unsigned long minimum_code_point;

			//The lead byte decides how long the sequence is and how many payload bits it carries itself.
			if (lead < 0x80) {
				//0..127, the ASCII range, maps directly to one UTF-16 code unit.
				utf16_str_output[utf16_str_iterator++] = (char16_t) lead;
				utf8_str_iterator++;
				continue;
			}
			else if (lead < 0xC0) {
				//0x80..0xBF is a continuation byte that does not follow a lead byte. Skip it.
				utf8_str_iterator++;
				continue;
			}
			else if (lead < 0xE0) {
				//110xxxxx: two bytes, U+0080..U+07FF.
				sequence_length = 2;
				code_point = lead & 0x1FUL;
				minimum_code_point = 0x80UL;
			}
			else if (lead < 0xF0) {
				//1110xxxx: three bytes, U+0800..U+FFFF.
				sequence_length = 3;
				code_point = lead & 0x0FUL;
				minimum_code_point = 0x800UL;
			}
			else if (lead < 0xF8) {
				//11110xxx: four bytes, U+10000..U+10FFFF.
				sequence_length = 4;
				code_point = lead & 0x07UL;
				minimum_code_point = 0x10000UL;
			}
			else {
				//0xF8..0xFF never appears in UTF-8. Skip it.
				utf8_str_iterator++;
				continue;
			}

			//Each continuation byte contributes six payload bits. Every byte is checked against the input size and the
			//10xxxxxx pattern before it is used, so a truncated sequence can never make us read past the terminator.
			int bytes_decoded = 1;
			while (bytes_decoded < sequence_length) {
				if (bytes_decoded >= utf8_str_size - utf8_str_iterator) {
					break;
				}
				const unsigned char next = input[utf8_str_iterator + bytes_decoded];
				if (!utf8_is_continuation_byte(next)) {
					break;
				}
				code_point = (code_point << 6) | (unsigned long) (next & 0x3F);
				bytes_decoded++;
			}

			if (bytes_decoded < sequence_length) {
				//Truncated sequence: drop only the lead byte so the byte that broke the sequence is examined again.
				utf8_str_iterator++;
				continue;
			}
			utf8_str_iterator += sequence_length;

			//Overlong encodings, encoded surrogates and values beyond U+10FFFF are not valid and produce no output.
			if (code_point < minimum_code_point
					|| code_point > 0x10FFFFUL
					|| (code_point >= 0xD800UL && code_point <= 0xDFFFUL)) {
				continue;
			}

			if (code_point < 0x10000UL) {
				//Basic Multilingual Plane: a single code unit.
				utf16_str_output[utf16_str_iterator++] = (char16_t) code_point;
			}
			else {
				//Supplementary planes need a surrogate pair. Half a pair is useless, so stop if both do not fit.
				if (utf16_str_output_size - utf16_str_iterator < 2) {
					break;
				}

				//The high (lead) surrogate must come first, followed by the low (trail) surrogate.
				const unsigned long offset = code_point - 0x10000UL;
				utf16_str_output[utf16_str_iterator++] = (char16_t) (0xD800UL + (offset >> 10));
				utf16_str_output[utf16_str_iterator++] = (char16_t) (0xDC00UL + (offset & 0x3FFUL));
			}
		}

		//Clear whatever is left of the output buffer.
		while (utf16_str_iterator < utf16_str_output_size) {
			utf16_str_output[utf16_str_iterator++] = 0;
		}
	}

	//Demo entry point: converts a short UTF-8 sample into a fixed-size UTF-16 buffer.
	int main() {
		//The element count computed below includes the terminating zero byte of the literal.
		unsigned char sample_utf8[] = u8"我是誰?";
		char16_t converted[25];

		const int sample_length = sizeof(sample_utf8) / sizeof(sample_utf8[0]);
		const int converted_capacity = sizeof(converted) / sizeof(converted[0]);

		utf8_to_utf16(sample_utf8, sample_length, converted, converted_capacity);

		//Nothing is printed. Put a breakpoint on the return statement and inspect "converted" in the debugger.
		return 0;
	}