Skip to content

Instantly share code, notes, and snippets.

@paulsmith
Created April 25, 2018 14:16
Show Gist options
  • Save paulsmith/203ef5654d8fb5ebb8d3776befd9fb5b to your computer and use it in GitHub Desktop.
Save paulsmith/203ef5654d8fb5ebb8d3776befd9fb5b to your computer and use it in GitHub Desktop.
utf-8 decoder
/*
 * Decodes the next Unicode code point from a UTF-8-encoded buffer.
 *
 *   str  pointer to the first byte of the sequence to decode
 *   len  number of readable bytes at str
 *   cp   receives the decoded code point; set to 0 on failure
 *
 * Returns the number of bytes consumed (1..4) so the caller can advance the
 * stream. On failure returns (size_t)-1; because the return type is unsigned,
 * callers must compare against (size_t)-1 rather than testing for "< 0".
 *
 * A sequence is rejected when the buffer is empty or truncated, the lead
 * byte is not a valid UTF-8 lead byte (including a stray continuation byte),
 * a continuation byte is malformed, the encoding is overlong, or the decoded
 * value is a UTF-16 surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
 */
size_t decode_next_utf8(const unsigned char *str, size_t len, int *cp)
{
    /* Smallest code point that genuinely needs N bytes; lower means overlong. */
    static const unsigned long min_for_len[5] = { 0, 0, 0x80, 0x800, 0x10000 };
    const size_t invalid = (size_t)-1;
    unsigned long value;
    size_t need;
    size_t i;

    *cp = 0;

    /* Never touch str[0] when there is nothing to read. */
    if (len == 0) {
        return invalid;
    }

    /* Classify the lead byte: sequence length and its payload bits. */
    if (str[0] <= 0x7f) {
        *cp = (int)str[0];
        return 1;
    } else if ((str[0] & 0xe0) == 0xc0) {
        need = 2;
        value = str[0] & 0x1fu;
    } else if ((str[0] & 0xf0) == 0xe0) {
        need = 3;
        value = str[0] & 0x0fu;
    } else if ((str[0] & 0xf8) == 0xf0) {
        need = 4;
        value = str[0] & 0x07u;
    } else {
        /* Stray continuation byte (10xxxxxx) or 0xf8..0xff. */
        return invalid;
    }

    /* Truncated sequence. */
    if (len < need) {
        return invalid;
    }

    /* Every continuation byte is 10xxxxxx and contributes six payload bits. */
    for (i = 1; i < need; i++) {
        if ((str[i] & 0xc0) != 0x80) {
            return invalid;
        }
        value = (value << 6) | (str[i] & 0x3fu);
    }

    /* Reject overlong forms, surrogates, and values outside Unicode. */
    if (value < min_for_len[need]) {
        return invalid;
    }
    if (value >= 0xd800 && value <= 0xdfff) {
        return invalid;
    }
    if (value > 0x10ffff) {
        return invalid;
    }

    *cp = (int)value;
    return need;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment