abdoei · October 3, 2024 23:49 · abdoei · Oct 3, 2024
diff --git a/converter.cpp b/converter.cpp
 #include <iostream>
 #include <iomanip>  // For std::hex
 #include <cstdint>  // For uint32_t


 // Decodes the single UTF-8 sequence that starts at `ptr` and stores the
 // resulting code point in `cp`.
 //
 // `ptr` is taken by value, so the caller's pointer is not advanced.
 // `cp` is always assigned. A malformed sequence (invalid lead byte, stray or
 // missing continuation byte, overlong form, UTF-16 surrogate, or a value above
 // U+10FFFF) yields U+FFFD REPLACEMENT CHARACTER.
 // Decoding stops at the first byte that is not a continuation byte, so a
 // sequence cut short by a terminating 0 byte is never read past.
 void Read(const char8_t* ptr, char32_t &cp) {
    constexpr char32_t kReplacement = 0xFFFD;
    const char32_t lead = *ptr;

    int trailing = 0;      // number of continuation bytes expected after the lead
    char32_t value = 0;    // payload bits accumulated so far
    char32_t minimum = 0;  // smallest value this length may encode (rejects overlongs)

    if (lead <= 0x7F) {  // One byte code point
        cp = lead;
        return;
    } else if (lead >= 0xC2 && lead <= 0xDF) {  // Two bytes (0xC0/0xC1 are always overlong)
        trailing = 1;
        value = lead & 0x1F;
        minimum = 0x80;
    } else if (lead >= 0xE0 && lead <= 0xEF) {  // Three bytes
        trailing = 2;
        value = lead & 0x0F;
        minimum = 0x800;
    } else if (lead >= 0xF0 && lead <= 0xF4) {  // Four bytes (0xF5+ would exceed U+10FFFF)
        trailing = 3;
        value = lead & 0x07;
        minimum = 0x10000;
    } else {  // Stray continuation byte (0x80-0xBF) or invalid lead byte
        cp = kReplacement;
        return;
    }

    for (int i = 1; i <= trailing; ++i) {
        const char32_t next = ptr[i];
        if ((next & 0xC0) != 0x80) {  // not of the form 10xxxxxx: truncated sequence
            cp = kReplacement;
            return;
        }
        value = (value << 6) | (next & 0x3F);
    }

    const bool overlong = value < minimum;
    const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
    const bool out_of_range = value > 0x10FFFF;
    cp = (overlong || surrogate || out_of_range) ? kReplacement : value;
 }

 // Prints `cp` to std::cout as "Code point: U+XXXX" followed by a newline and a
 // flush. The value is written in uppercase hexadecimal, zero-padded to at
 // least four digits (wider values are printed in full).
 //
 // std::hex, std::uppercase and std::setfill are sticky, so the stream's format
 // flags and fill character are saved first and restored afterwards; the call
 // leaves std::cout's formatting exactly as the caller had it.
 void printCodePoint(char32_t cp) {
    const std::ios_base::fmtflags saved_flags = std::cout.flags();
    const char saved_fill = std::cout.fill();

    std::cout << "Code point: U+" << std::hex << std::uppercase << std::setfill('0')
              << std::setw(4) << static_cast<std::uint32_t>(cp) << std::endl;

    std::cout.flags(saved_flags);
    std::cout.fill(saved_fill);
 }


 using namespace std;

 // Demo driver: walks a fixed UTF-8 string, decodes each sequence with Read()
 // and prints the resulting code point with printCodePoint().
 int main() {
    const char8_t* testStr = u8"Hello, 世界, الحمد لله"; // Mixed ASCII and multi-byte UTF-8

    std::cout << "Testing UTF-8 to UTF-32 conversion:" << std::endl;

    const char8_t* ptr = testStr;
    while (*ptr) {
        char32_t cp = 0;  // start from a defined value in case Read() does not assign it
        Read(ptr, cp);
        printCodePoint(cp);

        // Advance by the length announced by the lead byte rather than by the
        // decoded value: the value does not reliably tell how many bytes were
        // consumed (overlong or invalid input), and a value outside every range
        // would otherwise leave the pointer in place and loop forever.
        const unsigned lead = *ptr;
        int length = 1;  // ASCII, stray continuation byte, or invalid lead byte
        if (lead >= 0xC0 && lead <= 0xDF) {
            length = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            length = 3;
        } else if (lead >= 0xF0 && lead <= 0xF7) {
            length = 4;
        }

        // Always consume the lead byte, then only bytes that really are
        // continuation bytes (10xxxxxx). The terminating 0 is not one, so the
        // pointer can never step past the end of the string.
        ++ptr;
        for (int i = 1; i < length && (*ptr & 0xC0) == 0x80; ++i) {
            ++ptr;
        }
    }

    return 0;
 }

 /*
 OUTPUT:

 Testing UTF-8 to UTF-32 conversion:
 Code point: U+0048
 Code point: U+0065
 Code point: U+006C
 Code point: U+006C
 Code point: U+006F
 Code point: U+002C
 Code point: U+0020
 Code point: U+4E16
 Code point: U+754C
 Code point: U+002C
 Code point: U+0020
 Code point: U+0627
 Code point: U+0644
 Code point: U+062D
 Code point: U+0645
 Code point: U+062F
 Code point: U+0020
 Code point: U+0644
 Code point: U+0644
 Code point: U+0647
 */
	#include <iostream>
	#include <iomanip> // For std::hex
	#include <cstdint> // For uint32_t


	// Decodes the single UTF-8 sequence that starts at `ptr` and stores the
	// resulting code point in `cp`.
	//
	// `ptr` is taken by value, so the caller's pointer is not advanced.
	// `cp` is always assigned. A malformed sequence (invalid lead byte, stray or
	// missing continuation byte, overlong form, UTF-16 surrogate, or a value above
	// U+10FFFF) yields U+FFFD REPLACEMENT CHARACTER.
	// Decoding stops at the first byte that is not a continuation byte, so a
	// sequence cut short by a terminating 0 byte is never read past.
	void Read(const char8_t* ptr, char32_t &cp) {
	    constexpr char32_t kReplacement = 0xFFFD;
	    const char32_t lead = *ptr;

	    int trailing = 0;      // number of continuation bytes expected after the lead
	    char32_t value = 0;    // payload bits accumulated so far
	    char32_t minimum = 0;  // smallest value this length may encode (rejects overlongs)

	    if (lead <= 0x7F) { // One byte code point
	        cp = lead;
	        return;
	    } else if (lead >= 0xC2 && lead <= 0xDF) { // Two bytes (0xC0/0xC1 are always overlong)
	        trailing = 1;
	        value = lead & 0x1F;
	        minimum = 0x80;
	    } else if (lead >= 0xE0 && lead <= 0xEF) { // Three bytes
	        trailing = 2;
	        value = lead & 0x0F;
	        minimum = 0x800;
	    } else if (lead >= 0xF0 && lead <= 0xF4) { // Four bytes (0xF5+ would exceed U+10FFFF)
	        trailing = 3;
	        value = lead & 0x07;
	        minimum = 0x10000;
	    } else { // Stray continuation byte (0x80-0xBF) or invalid lead byte
	        cp = kReplacement;
	        return;
	    }

	    for (int i = 1; i <= trailing; ++i) {
	        const char32_t next = ptr[i];
	        if ((next & 0xC0) != 0x80) { // not of the form 10xxxxxx: truncated sequence
	            cp = kReplacement;
	            return;
	        }
	        value = (value << 6) | (next & 0x3F);
	    }

	    const bool overlong = value < minimum;
	    const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
	    const bool out_of_range = value > 0x10FFFF;
	    cp = (overlong || surrogate || out_of_range) ? kReplacement : value;
	}

	// Prints `cp` to std::cout as "Code point: U+XXXX" followed by a newline and a
	// flush. The value is written in uppercase hexadecimal, zero-padded to at
	// least four digits (wider values are printed in full).
	//
	// std::hex, std::uppercase and std::setfill are sticky, so the stream's format
	// flags and fill character are saved first and restored afterwards; the call
	// leaves std::cout's formatting exactly as the caller had it.
	void printCodePoint(char32_t cp) {
	    const std::ios_base::fmtflags saved_flags = std::cout.flags();
	    const char saved_fill = std::cout.fill();

	    std::cout << "Code point: U+" << std::hex << std::uppercase << std::setfill('0')
	              << std::setw(4) << static_cast<std::uint32_t>(cp) << std::endl;

	    std::cout.flags(saved_flags);
	    std::cout.fill(saved_fill);
	}


	using namespace std;

	// Demo driver: walks a fixed UTF-8 string, decodes each sequence with Read()
	// and prints the resulting code point with printCodePoint().
	int main() {
	    const char8_t* testStr = u8"Hello, 世界, الحمد لله"; // Mixed ASCII and multi-byte UTF-8

	    std::cout << "Testing UTF-8 to UTF-32 conversion:" << std::endl;

	    const char8_t* ptr = testStr;
	    while (*ptr) {
	        char32_t cp = 0; // start from a defined value in case Read() does not assign it
	        Read(ptr, cp);
	        printCodePoint(cp);

	        // Advance by the length announced by the lead byte rather than by the
	        // decoded value: the value does not reliably tell how many bytes were
	        // consumed (overlong or invalid input), and a value outside every range
	        // would otherwise leave the pointer in place and loop forever.
	        const unsigned lead = *ptr;
	        int length = 1; // ASCII, stray continuation byte, or invalid lead byte
	        if (lead >= 0xC0 && lead <= 0xDF) {
	            length = 2;
	        } else if (lead >= 0xE0 && lead <= 0xEF) {
	            length = 3;
	        } else if (lead >= 0xF0 && lead <= 0xF7) {
	            length = 4;
	        }

	        // Always consume the lead byte, then only bytes that really are
	        // continuation bytes (10xxxxxx). The terminating 0 is not one, so the
	        // pointer can never step past the end of the string.
	        ++ptr;
	        for (int i = 1; i < length && (*ptr & 0xC0) == 0x80; ++i) {
	            ++ptr;
	        }
	    }

	    return 0;
	}

	/*
	OUTPUT:

	Testing UTF-8 to UTF-32 conversion:
	Code point: U+0048
	Code point: U+0065
	Code point: U+006C
	Code point: U+006C
	Code point: U+006F
	Code point: U+002C
	Code point: U+0020
	Code point: U+4E16
	Code point: U+754C
	Code point: U+002C
	Code point: U+0020
	Code point: U+0627
	Code point: U+0644
	Code point: U+062D
	Code point: U+0645
	Code point: U+062F
	Code point: U+0020
	Code point: U+0644
	Code point: U+0644
	Code point: U+0647
	*/