Last active
June 9, 2021 03:17
-
-
Save nullhook/769f54b3edfcc8d2b2db6582914497d3 to your computer and use it in GitHub Desktop.
char type conversions in c++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <codecvt>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <locale>
#include <string>
// utf8/utf16/utf32 can be directly written to file without conversions | |
// sizeof(T) gives you bytes of the type | |
// .size() / .length() give the number of code units (elements of type T), not bytes and not user-perceived characters
// if char16_t data is stored, then open the file with utf16 encoding
// utf8 is slowly becoming the standard; MacRoman was apple's default but now it's utf8 | |
// 'locale' can give you the system's default language, currency, and date settings
// you need to know the encoding prior to decoding | |
// you can convert a char16_t to machine's default charset by using std::locale | |
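// utf16 needs fewer code units than utf8 for most non-ascii text (e.g. 'ß' is one 2-byte unit in utf16 but two 1-byte units in utf8), while pure ascii text takes twice as many bytes in utf16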
// endianness is byte ordering; little-endian means the least significant byte of a multi-byte value is stored first (at the lowest address)
// utf8 is compatible with basic ascii: ascii characters are a single byte long and their most significant bit is always 0
// basic ascii is only 0-127, but utf8 can encode about 2^21 code points (U+0000 to U+10FFFF)
// compiler: to calculate the byte length, or copy a utf8 string, it doesn't need to know about utf8 | |
// compiler: to calculate the number of code points, or to split a string correctly, it does need to know about utf8 | |
// splitting a string is the better example here. If you're interpreting it as ascii, but it actually has multi-byte utf8 characters in it, you can split in the middle of a code point by accident and produce two invalid or incorrect utf8 strings | |
// Demonstrates how the same text ("ßx") is represented in UTF-16 and UTF-8.
//
// Prints the code-unit size, string length and raw code units of each
// encoding, converts the UTF-16 string to UTF-8, and writes the converted
// bytes to "from_utf16.txt".
//
// Returns EXIT_SUCCESS, or EXIT_FAILURE if the output file cannot be opened
// or written.
//
// NOTE: this relies on C++17 semantics: u8"" literals are arrays of char,
// and <codecvt>/std::wstring_convert are deprecated but still available.
int main() {
    const std::u16string u16str = u"ßx";

    // Convert UTF-16 to UTF-8. to_bytes() throws std::range_error on invalid
    // input; the literal above is well-formed, so that cannot happen here.
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
    const std::string u8conv = converter.to_bytes(u16str);
    const std::string u8str = u8"ßx";

    std::cout << "\n";
    std::cout << "u16 Type size: " << sizeof(char16_t) << "\n";
    std::cout << "u16 String size: " << u16str.size() << "\n";
    std::cout << "u16->u8 String size: " << u8conv.size() << "\n";
    // There is no text-printing overload for const char16_t*, so cast to
    // const void* explicitly to print the address of the buffer.
    std::cout << "u16 pointer address: "
              << static_cast<const void*>(u16str.c_str()) << "\n";

    std::cout << "UTF-16 produced: ";
    std::cout << std::hex << std::showbase;
    for (const char16_t unit : u16str)
        std::cout << static_cast<unsigned>(unit) << ' ';  // print the numeric value of the code unit
    // std::hex and std::showbase are sticky: restore decimal output so the
    // sizes printed below are not shown as hexadecimal.
    std::cout << std::dec << std::noshowbase;
    std::cout << "\n";
    std::cout << "\n";

    // The UTF-8 code unit type is char; sizeof(u8str) would instead report
    // the size of the std::string object itself.
    std::cout << "u8 Type size: " << sizeof(char) << "\n";
    std::cout << "u8 String size: " << u8str.size() << "\n";
    // Streaming a const char* prints the text it points to (the raw UTF-8
    // bytes), not the address.
    std::cout << "u8 pointer: " << u8str.c_str() << "\n";

    std::cout << "UTF-16 to UTF-8 conversion produced: ";
    std::cout << std::hex << std::showbase;
    // Iterate as unsigned char: char may be signed, in which case bytes >127
    // would be negative and print sign-extended.
    for (const unsigned char byte : u8conv)
        std::cout << static_cast<unsigned>(byte) << ' ';
    std::cout << std::dec << std::noshowbase;
    std::cout << "\n";

    // Binary mode: the bytes are already encoded, so the stream must not
    // apply any newline translation to them.
    std::ofstream file("from_utf16.txt", std::ios::binary);
    if (!file) {
        std::cerr << "failed to open from_utf16.txt for writing\n";
        return EXIT_FAILURE;
    }
    file.write(u8conv.data(), static_cast<std::streamsize>(u8conv.size()));
    if (!file) {
        std::cerr << "failed to write from_utf16.txt\n";
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment