Last active
April 19, 2016 16:10
-
-
Save yangacer/308ac63312c516c54c54b535eb8dc912 to your computer and use it in GitHub Desktop.
UTF-8 to Unicode code point
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
using namespace std;
/// @brief Decode the UTF-8 sequence at the start of `utf8` into a code point.
///
/// @param bytes Out-parameter (must not be null). Receives the number of bytes
///              consumed from `utf8`. It is 0 only when `utf8` is null or
///              points at the terminating NUL; otherwise it is at least 1, so
///              a caller advancing by `*bytes` always makes progress and never
///              steps past the terminator.
/// @param utf8  NUL-terminated UTF-8 text, or null.
/// @return The decoded code point; 0 for null/empty input; U+FFFD (replacement
///         character) when the input starts with an invalid lead byte or the
///         sequence is truncated / has a non-continuation byte in it.
///
/// @note Overlong encodings and surrogate code points are not rejected.
uint32_t to_unicode(size_t *bytes, char const *utf8)
{
    // U+FFFD REPLACEMENT CHARACTER, returned for malformed input.
    uint32_t const replacement = 0xFFFD;

    if (!utf8 || !*utf8) {
        *bytes = 0;
        return 0;
    }

    uint8_t const first_byte = static_cast<uint8_t>(utf8[0]);
    size_t len = 0;
    uint32_t unicode = 0;

    // Classify the lead byte by its exact prefix and keep only its payload bits.
    if ((first_byte & 0x80) == 0x00) {          // 0xxxxxxx
        len = 1;
        unicode = first_byte;
    } else if ((first_byte & 0xE0) == 0xC0) {   // 110xxxxx
        len = 2;
        unicode = first_byte & 0x1F;
    } else if ((first_byte & 0xF0) == 0xE0) {   // 1110xxxx
        len = 3;
        unicode = first_byte & 0x0F;
    } else if ((first_byte & 0xF8) == 0xF0) {   // 11110xxx
        len = 4;
        unicode = first_byte & 0x07;
    } else {
        // Stray continuation byte (10xxxxxx) or 0xF8..0xFF: skip exactly one
        // byte so the caller can resynchronise instead of stalling.
        *bytes = 1;
        return replacement;
    }

    for (size_t i = 1; i < len; ++i) {
        uint8_t const next = static_cast<uint8_t>(utf8[i]);
        // The NUL terminator is not a continuation byte either, so this check
        // also stops a truncated sequence before reading out of bounds.
        if ((next & 0xC0) != 0x80) {
            *bytes = i;  // consume only the bytes that belonged to this sequence
            return replacement;
        }
        unicode = (unicode << 6) | (next & 0x3F);
    }

    *bytes = len;
    return unicode;
}
int main() { | |
char const *s = "中文測試a"; | |
size_t bytes = 0; | |
while(*s) { | |
uint32_t code = to_unicode(&bytes, s); | |
cout << code << " " << bytes << endl; | |
s += bytes; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment