Skip to content

Instantly share code, notes, and snippets.

@max-dark
Created December 1, 2024 02:43
Show Gist options
  • Save max-dark/8922cd047164091c4053261a9df9fb7c to your computer and use it in GitHub Desktop.
Save max-dark/8922cd047164091c4053261a9df9fb7c to your computer and use it in GitHub Desktop.
utf decode
// https://en.wikipedia.org/wiki/UTF-8
#include <iostream>
#include <string>
#include <iomanip>
/// Byte type used for the UTF-8 bit patterns below.
using u_char = unsigned char;

/// Bit patterns for classifying a single UTF-8 byte.
///
/// Each `*_mask` selects the high bits of a byte that identify its role and
/// the matching `*_value` is the pattern those bits must equal.
enum utf_mask : u_char
{
    // Continuation byte: 10xxxxxx.
    char_part = 0b1000'0000,
    part_mask = 0b1100'0000,

    // Lead byte of a 1-byte sequence: 0xxxxxxx.
    n1_value = 0b0000'0000,
    n1_mask  = 0b1000'0000,

    // Lead byte of a 2-byte sequence: 110xxxxx.
    n2_value = 0b1100'0000,
    n2_mask  = 0b1110'0000,

    // Lead byte of a 3-byte sequence: 1110xxxx.
    n3_value = 0b1110'0000,
    n3_mask  = 0b1111'0000,

    // Lead byte of a 4-byte sequence: 11110xxx.
    n4_value = 0b1111'0000,
    n4_mask  = 0b1111'1000,
};
/// Tests whether the bits of `c` selected by mask `m` equal the pattern `v`.
///
/// @param c  byte to inspect
/// @param v  expected bit pattern
/// @param m  mask selecting the bits to compare
/// @return   true when `(c & m) == v`
bool eq(char c, u_char v, u_char m)
{
    // `c` may be negative where plain char is signed; the mask fits in
    // 8 bits, so the sign-extended high bits are cleared by the AND.
    const int selected_bits = c & m;
    return selected_bits == v;
}
/// Reports whether `c` is a UTF-8 continuation byte (bit pattern 10xxxxxx).
///
/// @param c  byte to inspect
/// @return   true when the top two bits of `c` are `10`
bool is_char_part(char c)
{
    const bool top_bits_are_10 = eq(c, char_part, part_mask);
    return top_bits_are_10;
}
int get_size(char c)
{
if (is_char_part(c)) return -1;
if (eq(c, n1_value, n1_mask)) return 1;
if (eq(c, n2_value, n2_mask)) return 2;
if (eq(c, n3_value, n3_mask)) return 3;
if (eq(c, n4_value, n4_mask)) return 4;
return -2;
}
int main()
{
using namespace std::literals;
std::string str = u8"абв";
for (size_t i = 0; i < str.size();)
{
char c = str[i];
int n = get_size(c);
if (n <= 0)
{
std::cout << "<?>" << std::endl;
++i;
}
else
{
std::cout << str.substr(i, n) << std::endl;
i += n;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment