Skip to content

Instantly share code, notes, and snippets.

@KoKuToru
Created May 1, 2022 14:22
Show Gist options
  • Save KoKuToru/842c933a937b898c3a1dae4e0ed5a024 to your computer and use it in GitHub Desktop.
Save KoKuToru/842c933a937b898c3a1dae4e0ed5a024 to your computer and use it in GitHub Desktop.
Convert UTF8 to UTF16
#include <iostream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <tuple>
/// Decodes the first UTF-8 encoded code point of `input`.
///
/// @param input UTF-8 bytes to decode; must contain at least one byte.
/// @return A tuple `{rest, code}`. On success `rest` is `input` without the
///         bytes that were consumed and `code` is the decoded code point.
///         On malformed input `rest` is empty and `code` is
///         `static_cast<char32_t>(-1)`. Malformed means: a lead byte that
///         starts no 1-4 byte sequence, a continuation byte that is not of
///         the form `10xxxxxx`, an overlong encoding, or a value above
///         U+10FFFF.
/// @throws std::out_of_range if `input` is empty or ends before the sequence
///         announced by its lead byte is complete.
///
/// Surrogate code points (U+D800..U+DFFF) are returned as decoded; rejecting
/// them is left to the caller.
std::tuple<std::u8string_view, char32_t> decode_u8_charpoint(std::u8string_view input) {
    using Result = std::tuple<std::u8string_view, char32_t>;
    // Error sentinel: the value the literal -1 converts to as a char32_t.
    constexpr char32_t kInvalid = static_cast<char32_t>(-1);

    // at() performs the bounds check that reports empty input.
    const char32_t lead = input.at(0);

    // Single byte (ASCII): 0xxxxxxx.
    if ((lead & 0b10000000) == 0b00000000) {
        return Result{input.substr(1), lead};
    }

    std::u8string_view::size_type length = 0;  // total bytes in the sequence
    char32_t code = 0;                         // payload bits collected so far
    char32_t smallest = 0;                     // smallest value that needs `length` bytes
    if ((lead & 0b11100000) == 0b11000000) {
        length = 2;
        code = lead & 0b00011111;
        smallest = 0x80;
    } else if ((lead & 0b11110000) == 0b11100000) {
        length = 3;
        code = lead & 0b00001111;
        smallest = 0x800;
    } else if ((lead & 0b11111000) == 0b11110000) {
        length = 4;
        code = lead & 0b00000111;
        smallest = 0x10000;
    } else {
        // Stray continuation byte or a lead byte of a 5+ byte form.
        return Result{input.substr(0, 0), kInvalid};
    }

    // substr() throws std::out_of_range when the sequence is truncated, so
    // the unchecked operator[] below never reads past the end.
    const std::u8string_view rest = input.substr(length);

    for (std::u8string_view::size_type i = 1; i < length; ++i) {
        const char32_t byte = input[i];
        if ((byte & 0b11000000) != 0b10000000) {
            return Result{input.substr(0, 0), kInvalid};
        }
        code = (code << 6) | (byte & 0b00111111);
    }

    // Reject overlong encodings and values beyond the Unicode code space.
    if (code < smallest || code > 0x10FFFF) {
        return Result{input.substr(0, 0), kInvalid};
    }
    return Result{rest, code};
}
/// Converts UTF-8 encoded text to UTF-16.
///
/// Each code point is obtained from decode_u8_charpoint(). Code points up to
/// U+FFFF become one UTF-16 code unit; code points from U+10000 to U+10FFFF
/// become a high/low surrogate pair.
///
/// @param input UTF-8 encoded text; may be empty.
/// @return The UTF-16 encoding of `input`.
/// @throws std::runtime_error ("Wrong UTF8 Encoding") if the decoder reports
///         an error or yields a value above U+10FFFF.
/// @throws std::runtime_error ("Reserved High-Surrogates and Low-Surrogates")
///         if a decoded code point lies in U+D800..U+DFFF.
/// Any exception thrown by decode_u8_charpoint() itself propagates unchanged.
std::u16string from_utf8(std::u8string_view input) {
    std::u16string res;
    // Upper bound: no code point needs more UTF-16 code units than it has
    // UTF-8 bytes (1->1, 2->1, 3->1, 4->2), so one reservation is enough.
    res.reserve(input.size());

    std::u8string_view remaining = input;
    while (!remaining.empty()) {
        const auto [rest, code] = decode_u8_charpoint(remaining);
        remaining = rest;

        // Covers the decoder's error sentinel (char32_t(-1)) as well as any
        // value that is outside the Unicode code space.
        if (code > 0x10FFFF) {
            // Thrown by value so that `catch (const std::exception&)` sees it.
            throw std::runtime_error("Wrong UTF8 Encoding");
        }
        if (code >= 0xD800 && code <= 0xDFFF) [[unlikely]] {
            throw std::runtime_error("Reserved High-Surrogates and Low-Surrogates");
        }
        if (code <= 0xFFFF) [[likely]] {
            res += static_cast<char16_t>(code);
            continue;
        }

        // Supplementary plane: split the 20-bit offset into two 10-bit halves.
        const char32_t offset = code - 0x10000;
        res += static_cast<char16_t>(0xD800 + (offset >> 10));
        res += static_cast<char16_t>(0xDC00 + (offset & 0x3FF));
    }
    return res;
}
int main() {
{
std::u8string_view u8view = u8"π„ž";
std::u16string_view u16view = u"π„ž";
std::string_view view = "π„ž";
std::cout << "u8 " << u8view.size() << " " << u8view.length() << std::endl;
std::cout << "u16 " << u16view.size() << " " << u16view.length() << std::endl;
std::cout << " " << view.size() << " " << view.length() << std::endl;
}
{
std::u8string u8view = u8"π„ž";
std::u16string u16view = u"π„ž";
std::string view = "π„ž";
std::u16string u16_v2 = from_utf8(u8"π„ž");
std::cout << "u8 " << u8view.size() << " " << u8view.length() << std::endl;
std::cout << "u16 " << u16view.size() << " " << u16view.length() << std::endl;
std::cout << " " << view.size() << " " << view.length() << std::endl;
std::cout << "u16_v2 " << u16_v2.size() << " " << u16_v2.length() << std::endl;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment