Skip to content

Instantly share code, notes, and snippets.

@yangacer
Last active April 19, 2016 16:10
Show Gist options
  • Save yangacer/308ac63312c516c54c54b535eb8dc912 to your computer and use it in GitHub Desktop.
Save yangacer/308ac63312c516c54c54b535eb8dc912 to your computer and use it in GitHub Desktop.
UTF8 to Unicode codepoint
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
using namespace std;
/**
 * Decode a single UTF-8 encoded code point from the start of a
 * NUL-terminated string.
 *
 * @param bytes Out-parameter receiving the number of bytes consumed from
 *              `utf8`. It is 0 only when `utf8` is null or empty; for any
 *              other input it is at least 1 and never reaches past the NUL
 *              terminator, so a caller that advances by `*bytes` always
 *              makes progress and stays inside the string. May be null if
 *              the caller does not need the length.
 * @param utf8  NUL-terminated UTF-8 text; may be null.
 * @return The decoded code point. Returns 0 for null/empty input and
 *         U+FFFD (replacement character) when the bytes at `utf8` do not
 *         form a well-formed UTF-8 sequence (invalid lead byte, missing or
 *         malformed continuation byte, overlong encoding, surrogate, or a
 *         value above U+10FFFF).
 */
uint32_t to_unicode(size_t *bytes, char const *utf8)
{
    constexpr uint32_t kReplacement = 0xFFFD;
    constexpr uint32_t kMaxCodePoint = 0x10FFFF;

    size_t consumed = 0;
    uint32_t code_point = 0;

    if (utf8 && *utf8) {
        const uint8_t lead = static_cast<uint8_t>(utf8[0]);

        // Classify the lead byte: expected sequence length, payload bits of
        // the lead byte, and the smallest value that legitimately needs that
        // many bytes (used to reject overlong encodings).
        size_t len = 0;
        uint32_t min_value = 0;
        if (lead < 0x80) {
            len = 1;
            code_point = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            len = 2;
            code_point = lead & 0x1F;
            min_value = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            len = 3;
            code_point = lead & 0x0F;
            min_value = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            len = 4;
            code_point = lead & 0x07;
            min_value = 0x10000;
        }

        // The lead byte is always consumed, even when it is invalid, so the
        // caller can resynchronise on the next byte.
        consumed = 1;

        if (len == 0) {
            // Stray continuation byte (10xxxxxx) or 0xF8..0xFF.
            code_point = kReplacement;
        } else {
            bool well_formed = true;
            for (; consumed < len; ++consumed) {
                const uint8_t next = static_cast<uint8_t>(utf8[consumed]);
                // Also catches the NUL terminator of a truncated sequence;
                // the offending byte is left unconsumed.
                if ((next & 0xC0) != 0x80) {
                    well_formed = false;
                    break;
                }
                code_point = (code_point << 6) | (next & 0x3Fu);
            }

            const bool is_surrogate =
                code_point >= 0xD800 && code_point <= 0xDFFF;
            if (!well_formed || code_point < min_value ||
                code_point > kMaxCodePoint || is_surrogate) {
                code_point = kReplacement;
            }
        }
    }

    if (bytes) {
        *bytes = consumed;
    }
    return code_point;
}
/**
 * Demo driver: walks a hard-coded string one code point at a time and prints
 * each decoded value followed by the number of bytes the decoder reported.
 *
 * @return 0 on completion.
 */
int main() {
    char const *s = "中文測試a";
    size_t bytes = 0;
    while (*s) {
        uint32_t code = to_unicode(&bytes, s);
        // '\n' instead of endl: no need to flush the stream on every line.
        std::cout << code << " " << bytes << '\n';
        // Always step forward by at least one byte. If the decoder reports 0
        // bytes for a non-empty input (e.g. the literal was compiled with a
        // non-UTF-8 execution charset), advancing by 0 would loop forever.
        // *s is non-NUL here, so stepping one byte stays inside the string.
        s += (bytes != 0 ? bytes : 1);
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment