Created
May 26, 2020 14:31
-
-
Save ryutorion/c6c6e3e3f45de5261de55a3407ec5d21 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint> | |
#include <cstdio> | |
/// Returns true when `c` is a UTF-8 continuation (trailing) byte, i.e. 0x80..0xBF.
inline bool isUTF8Tail(uint8_t c)
{
    return 0x80 <= c && c <= 0xBF;
}

/// Decodes a single UTF-8 encoded code point from the start of `p_utf8_str`.
///
/// Only well-formed sequences are accepted: stray continuation bytes, overlong
/// encodings, UTF-16 surrogates (U+D800..U+DFFF), values above U+10FFFF and
/// truncated sequences are all rejected.
///
/// Bytes are read strictly left to right and every byte is validated before
/// the next one is read, so a NUL-terminated buffer is never read past its
/// terminator (a NUL is not a valid continuation byte).
///
/// @param p_utf8_str  Pointer to the first byte of the sequence to decode.
/// @param c           Receives the decoded code point on success; it is left
///                    unmodified on failure.
/// @return The number of bytes consumed (1 to 4) on success, -1 on failure.
int convertUTF8CharToUTF32Char(const char * p_utf8_str, char32_t & c)
{
    const uint8_t c0 = static_cast<uint8_t>(p_utf8_str[0]);

    // 1 byte: U+0000..U+007F
    if(c0 <= 0x7F)
    {
        c = c0;
        return 1;
    }

    // 0x80..0xBF are continuation bytes, 0xC0/0xC1 can only start overlong
    // encodings, and 0xF5..0xFF would encode values above U+10FFFF.
    if(c0 < 0xC2 || 0xF4 < c0)
    {
        return -1;
    }

    // The valid range of the second byte depends on the lead byte.
    uint8_t c1_min = 0x80;
    uint8_t c1_max = 0xBF;
    switch(c0)
    {
    case 0xE0: c1_min = 0xA0; break; // below 0xA0 would be an overlong 3-byte form
    case 0xED: c1_max = 0x9F; break; // above 0x9F would be a surrogate
    case 0xF0: c1_min = 0x90; break; // below 0x90 would be an overlong 4-byte form
    case 0xF4: c1_max = 0x8F; break; // above 0x8F would exceed U+10FFFF
    default: break;
    }

    const uint8_t c1 = static_cast<uint8_t>(p_utf8_str[1]);
    if(c1 < c1_min || c1_max < c1)
    {
        return -1;
    }

    // 2 bytes: U+0080..U+07FF
    if(c0 <= 0xDF)
    {
        c = ((static_cast<char32_t>(c0) & 0x1F) << 6)
            | (static_cast<char32_t>(c1) & 0x3F);
        return 2;
    }

    const uint8_t c2 = static_cast<uint8_t>(p_utf8_str[2]);
    if(!isUTF8Tail(c2))
    {
        return -1;
    }

    // 3 bytes: U+0800..U+FFFF (surrogates already excluded above)
    if(c0 <= 0xEF)
    {
        c = ((static_cast<char32_t>(c0) & 0x0F) << 12)
            | ((static_cast<char32_t>(c1) & 0x3F) << 6)
            | (static_cast<char32_t>(c2) & 0x3F);
        return 3;
    }

    const uint8_t c3 = static_cast<uint8_t>(p_utf8_str[3]);
    if(!isUTF8Tail(c3))
    {
        return -1;
    }

    // 4 bytes: U+10000..U+10FFFF
    c = ((static_cast<char32_t>(c0) & 0x07) << 18)
        | ((static_cast<char32_t>(c1) & 0x3F) << 12)
        | ((static_cast<char32_t>(c2) & 0x3F) << 6)
        | (static_cast<char32_t>(c3) & 0x3F);
    return 4;
}
int main(int argc, char * argv[]) | |
{ | |
char s[] = u8"こんにちは"; | |
char32_t s32[] = U"こんにちは"; | |
int index = 0; | |
char * p = s; | |
while(*p) | |
{ | |
char32_t c; | |
int length = convertUTF8CharToUTF32Char(p, c); | |
if(length <= 0) | |
{ | |
return -1; | |
} | |
if(s32[index] == c) | |
{ | |
printf("s32[%d] match\n", index); | |
} | |
else | |
{ | |
printf("s32[%d] unmatch\n", index); | |
} | |
p += length; | |
++index; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment