Created
October 3, 2024 23:49
-
-
Save abdoei/54d2fa702544ca891c47b805d3518d8a to your computer and use it in GitHub Desktop.
UTF-8 to UTF-32 simple implementation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <iomanip> // For std::hex | |
#include <cstdint> // For uint32_t | |
/// Decodes the single UTF-8 encoded code point that starts at `ptr`.
///
/// @param ptr Pointer to the first byte of a NUL-terminated UTF-8 sequence.
///            It is taken by value, so the caller's pointer is NOT advanced.
/// @param cp  Receives the decoded code point. It is always assigned: on
///            ill-formed input (stray continuation byte, invalid lead byte,
///            truncated sequence, overlong form, surrogate, or a value above
///            U+10FFFF) it is set to U+FFFD REPLACEMENT CHARACTER.
void Read(const char8_t* ptr, char32_t &cp) {
    constexpr char32_t kReplacement = 0xFFFD;
    constexpr char32_t kMaxCodePoint = 0x10FFFF;

    // Pessimistic default so that every early return leaves `cp` defined.
    cp = kReplacement;

    const char32_t lead = *ptr;
    if (lead <= 0x7F) {  // One byte code point (ASCII, including NUL)
        cp = lead;
        return;
    }

    int trailing = 0;        // number of continuation bytes expected
    char32_t value = 0;      // payload bits gathered so far
    char32_t minimum = 0;    // smallest value that legitimately needs this length
    if (lead >= 0xC2 && lead <= 0xDF) {         // Two bytes (0xC0/0xC1 are always overlong)
        trailing = 1;
        value = lead & 0x1F;
        minimum = 0x80;
    } else if (lead >= 0xE0 && lead <= 0xEF) {  // Three bytes
        trailing = 2;
        value = lead & 0x0F;
        minimum = 0x800;
    } else if (lead >= 0xF0 && lead <= 0xF4) {  // Four bytes (0xF5+ would exceed U+10FFFF)
        trailing = 3;
        value = lead & 0x07;
        minimum = 0x10000;
    } else {
        return;  // stray continuation byte (0x80-0xBF) or invalid lead byte
    }

    for (int i = 1; i <= trailing; ++i) {
        const char32_t next = ptr[i];
        // A continuation byte must look like 10xxxxxx. The NUL terminator fails
        // this check, so a truncated sequence never reads past the end.
        if ((next & 0xC0) != 0x80) {
            return;
        }
        value = (value << 6) | (next & 0x3F);
    }

    const bool overlong = value < minimum;
    const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
    if (overlong || surrogate || value > kMaxCodePoint) {
        return;
    }
    cp = value;
}
/// Prints `cp` to std::cout as "Code point: U+XXXX" followed by a newline.
///
/// The value is written in upper-case hexadecimal, zero-padded to at least
/// four digits (larger values simply use more digits). The stream's format
/// flags and fill character are restored before returning, so the hex /
/// uppercase / zero-fill settings do not leak into later output.
///
/// @param cp Code point to print.
void printCodePoint(char32_t cp) {
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    const char savedFill = std::cout.fill();

    // Cast to an integer type: char32_t has no usable operator<< for narrow streams.
    std::cout << "Code point: U+" << std::hex << std::uppercase << std::setfill('0')
              << std::setw(4) << static_cast<std::uint32_t>(cp) << '\n';

    std::cout.flags(savedFlags);
    std::cout.fill(savedFill);
}
using namespace std; | |
int main() { | |
const char8_t* testStr = u8"Hello, 世界, الحمد لله"; // Mixed ASCII and multi-byte UTF-8 | |
const char8_t* ptr = testStr; | |
char32_t cp; | |
std::cout << "Testing UTF-8 to UTF-32 conversion:" << std::endl; | |
while (*ptr) { | |
Read(ptr, cp); | |
printCodePoint(cp); | |
// Increment pointer according to the number of bytes processed | |
if (cp <= 0x7F) { // 1-byte character | |
ptr += 1; | |
} else if (cp <= 0x7FF) { // 2-byte character | |
ptr += 2; | |
} else if (cp <= 0xFFFF) { // 3-byte character | |
ptr += 3; | |
} else if (cp <= 0x10FFFF) { // 4-byte character | |
ptr += 4; | |
} | |
} | |
return 0; | |
} | |
/* | |
OUTPUT: | |
Testing UTF-8 to UTF-32 conversion: | |
Code point: U+0048 | |
Code point: U+0065 | |
Code point: U+006C | |
Code point: U+006C | |
Code point: U+006F | |
Code point: U+002C | |
Code point: U+0020 | |
Code point: U+4E16 | |
Code point: U+754C | |
Code point: U+002C | |
Code point: U+0020 | |
Code point: U+0627 | |
Code point: U+0644 | |
Code point: U+062D | |
Code point: U+0645 | |
Code point: U+062F | |
Code point: U+0020 | |
Code point: U+0644 | |
Code point: U+0644 | |
Code point: U+0647 | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
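WIKI: UTF-8 encodes code points in one to four bytes, depending on the value of the code point. In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:

| First code point | Last code point | Byte 1   | Byte 2   | Byte 3   | Byte 4   |
|------------------|-----------------|----------|----------|----------|----------|
| U+0000           | U+007F          | 0yyyzzzz |          |          |          |
| U+0080           | U+07FF          | 110xxxyy | 10yyzzzz |          |          |
| U+0800           | U+FFFF          | 1110wwww | 10xxxxyy | 10yyzzzz |          |
| U+010000         | U+10FFFF        | 11110uvv | 10vvwwww | 10xxxxyy | 10yyzzzz |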
