Skip to content

Instantly share code, notes, and snippets.

@abdoei
Created October 3, 2024 23:49
Show Gist options
  • Save abdoei/54d2fa702544ca891c47b805d3518d8a to your computer and use it in GitHub Desktop.
Save abdoei/54d2fa702544ca891c47b805d3518d8a to your computer and use it in GitHub Desktop.
UTF-8 to UTF-32 simple implementation
#include <iostream>
#include <iomanip> // For std::hex
#include <cstdint> // For uint32_t
// Decodes the single UTF-8 encoded code point that starts at `ptr`.
//
// ptr: points at the first (lead) byte of the sequence. Bytes after the lead
//      are only read while each preceding byte was a valid continuation byte,
//      so decoding never steps over the NUL terminator of a C-style string.
// cp:  receives the decoded code point. If the bytes at `ptr` are not a
//      well-formed UTF-8 sequence (invalid lead byte, missing or bad
//      continuation byte, overlong form, UTF-16 surrogate, or a value above
//      U+10FFFF), cp is set to U+FFFD REPLACEMENT CHARACTER instead of being
//      left unassigned.
void Read(const char8_t* ptr, char32_t &cp) {
    constexpr char32_t kReplacement = 0xFFFD;

    const char32_t lead = *ptr;

    // One byte code point: the byte is the value.
    if (lead <= 0x7F) {
        cp = lead;
        return;
    }

    int trailing = 0;      // continuation bytes that must follow the lead byte
    char32_t value = 0;    // payload bits collected so far
    char32_t minimum = 0;  // smallest value that legitimately needs this length

    if (lead >= 0xC0 && lead <= 0xDF) {         // Two bytes code point
        trailing = 1;
        value = lead & 0x1F;
        minimum = 0x80;
    } else if (lead >= 0xE0 && lead <= 0xEF) {  // Three bytes code point
        trailing = 2;
        value = lead & 0x0F;
        minimum = 0x800;
    } else if (lead >= 0xF0 && lead <= 0xF7) {  // Four bytes code point
        trailing = 3;
        value = lead & 0x07;
        minimum = 0x10000;
    } else {
        // 0x80..0xBF is a continuation byte with no lead; 0xF8..0xFF never
        // appears in UTF-8.
        cp = kReplacement;
        return;
    }

    for (int i = 1; i <= trailing; ++i) {
        const char32_t next = ptr[i];
        // Every continuation byte looks like 10xxxxxx. A NUL terminator fails
        // this check, which is what stops a truncated sequence from reading on.
        if ((next & 0xC0) != 0x80) {
            cp = kReplacement;
            return;
        }
        value = (value << 6) | (next & 0x3F);
    }

    const bool overlong = value < minimum;
    const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
    const bool outOfRange = value > 0x10FFFF;
    cp = (overlong || surrogate || outOfRange) ? kReplacement : value;
}
// Prints one code point to std::cout as "Code point: U+XXXX" and a newline.
//
// cp: the code point to print. It is written in uppercase hexadecimal,
//     zero-padded to at least four digits; wider values are printed in full.
//
// The stream's format flags and fill character are captured up front and
// restored before returning, so the hex / uppercase / zero-fill settings used
// here do not leak into whatever is written to std::cout afterwards.
void printCodePoint(char32_t cp) {
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    const char savedFill = std::cout.fill();

    std::cout << "Code point: U+" << std::hex << std::uppercase << std::setfill('0')
              << std::setw(4) << static_cast<uint32_t>(cp) << '\n';

    std::cout.flags(savedFlags);
    std::cout.fill(savedFill);
}
using namespace std;
// Demo driver: walks a fixed UTF-8 string, decodes each code point with Read()
// and prints it with printCodePoint(). Always returns 0.
int main() {
    const char8_t* testStr = u8"Hello, 世界, الحمد لله"; // Mixed ASCII and multi-byte UTF-8
    const char8_t* ptr = testStr;

    std::cout << "Testing UTF-8 to UTF-32 conversion:" << '\n';
    while (*ptr) {
        // Start from U+FFFD so a defined value is printed in case Read() does
        // not assign cp for the bytes at ptr.
        char32_t cp = 0xFFFD;
        Read(ptr, cp);
        printCodePoint(cp);

        // The sequence length comes from the lead byte, not from the decoded
        // value: the value cannot tell how many bytes were actually consumed
        // for invalid or overlong input, and a value above 0x10FFFF would
        // otherwise leave ptr where it is and loop forever.
        const unsigned lead = *ptr;
        int length = 1;
        if (lead >= 0xC0 && lead <= 0xDF) {         // 2-byte character
            length = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {  // 3-byte character
            length = 3;
        } else if (lead >= 0xF0 && lead <= 0xF7) {  // 4-byte character
            length = 4;
        }

        // Always consume the lead byte, then only as many continuation bytes
        // (10xxxxxx) as are really there. The NUL terminator is not a
        // continuation byte, so a truncated sequence cannot carry ptr past it.
        ++ptr;
        for (int i = 1; i < length && (*ptr & 0xC0) == 0x80; ++i) {
            ++ptr;
        }
    }
    return 0;
}
/*
OUTPUT:
Testing UTF-8 to UTF-32 conversion:
Code point: U+0048
Code point: U+0065
Code point: U+006C
Code point: U+006C
Code point: U+006F
Code point: U+002C
Code point: U+0020
Code point: U+4E16
Code point: U+754C
Code point: U+002C
Code point: U+0020
Code point: U+0627
Code point: U+0644
Code point: U+062D
Code point: U+0645
Code point: U+062F
Code point: U+0020
Code point: U+0644
Code point: U+0644
Code point: U+0647
*/
@abdoei
Copy link
Author

abdoei commented Oct 3, 2024

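From Wikipedia: UTF-8 encodes code points in one to four bytes, depending on the value of the code point. In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:

| First code point | Last code point | Byte 1   | Byte 2   | Byte 3   | Byte 4   |
|------------------|-----------------|----------|----------|----------|----------|
| U+0000           | U+007F          | 0yyyzzzz |          |          |          |
| U+0080           | U+07FF          | 110xxxyy | 10yyzzzz |          |          |
| U+0800           | U+FFFF          | 1110wwww | 10xxxxyy | 10yyzzzz |          |
| U+010000         | U+10FFFF        | 11110uvv | 10vvwwww | 10xxxxyy | 10yyzzzz |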

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment